{
  "generated_at": "2026-05-14T12:52:54.647220+08:00",
  "timezone": "Asia/Shanghai",
  "lookback_hours": 24,
  "sorting": {
    "default_sort_by": "hybrid",
    "summary": "hybrid (relevance first, published_at tie-break)",
    "weights": {
      "title_match_weight": 40,
      "summary_match_weight": 18,
      "doi_weight": 12,
      "pdf_weight": 8,
      "rich_summary_weight": 6,
      "metadata_weight": 4,
      "multi_source_weight": 10,
      "freshness_weight_cap": 24
    },
    "feeds": [
      {
        "name": "LM",
        "sort_by": "hybrid"
      },
      {
        "name": "Agent Runtime Security",
        "sort_by": "hybrid"
      }
    ]
  },
  "highlights": [
    "主题「LLM」：命中 14 篇，覆盖 LM、Agent Runtime Security，代表论文包括 《RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation》、《MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling》。",
    "主题「Language Model」：命中 12 篇，覆盖 LM，代表论文包括 《RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation》、《MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling》。",
    "主题「Benchmark」：命中 3 篇，覆盖 LM，代表论文包括 《Harnessing Agentic Evolution》、《GHGbench: A Unified Multi-Entity, Multi-Task Benchmark for Carbon Emission Prediction》。",
    "主题「Evaluation」：命中 2 篇，覆盖 LM、Agent Runtime Security，代表论文包括 《Harnessing Agentic Evolution》、《Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents》。",
    "主题「Reasoning」：命中 1 篇，覆盖 LM，代表论文包括 《A Hierarchical Language Model with Predictable Scaling Laws and Provable Benefits of Reasoning》。"
  ],
  "focus_items": [],
  "action_items": [],
  "topic_sections": [
    {
      "name": "LLM",
      "paper_count": 14,
      "feed_names": [
        "LM",
        "Agent Runtime Security"
      ],
      "paper_titles": [
        "RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation",
        "MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling",
        "An LLM-Based System for Argument Reconstruction",
        "OpenAaaS: An Open Agent-as-a-Service Framework for Distributed Materials-Informatics Research",
        "(How) Do Large Language Models Understand High-Level Message Sequence Charts?",
        "Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling",
        "Senses Wide Shut: A Representation-Action Gap in Omnimodal LLMs",
        "ScioMind: Cognitively Grounded Multi-Agent Social Simulation with Anchoring-Based Belief Dynamics and Dynamic Profiles",
        "GHGbench: A Unified Multi-Entity, Multi-Task Benchmark for Carbon Emission Prediction",
        "Children's English Reading Story Generation via Supervised Fine-Tuning of Compact LLMs with Controllable Difficulty and Safety",
        "FlowCompile: An Optimizing Compiler for Structured LLM Workflows",
        "Edit-level Majority Voting Mitigates Over-Correction in LLM-based Grammatical Error Correction",
        "Neurosymbolic Auditing of Natural-Language Software Requirements",
        "LLM-Based Persuasion Enables Guardrail Override in Frontier LLMs"
      ],
      "key_points": [
        "《RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation》〔评测 / 数据 / 方法〕：Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under ti…",
        "《MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling》〔评测 / 数据 / 应用 / 方法〕：Multimodal irregular time series (MITS) consist of asynchronous and irregularly sampled observations from heterogeneous numerical and textual channels. In heal…"
      ]
    },
    {
      "name": "Language Model",
      "paper_count": 12,
      "feed_names": [
        "LM"
      ],
      "paper_titles": [
        "RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation",
        "MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling",
        "An LLM-Based System for Argument Reconstruction",
        "OpenAaaS: An Open Agent-as-a-Service Framework for Distributed Materials-Informatics Research",
        "(How) Do Large Language Models Understand High-Level Message Sequence Charts?",
        "Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling",
        "Senses Wide Shut: A Representation-Action Gap in Omnimodal LLMs",
        "ScioMind: Cognitively Grounded Multi-Agent Social Simulation with Anchoring-Based Belief Dynamics and Dynamic Profiles",
        "Children's English Reading Story Generation via Supervised Fine-Tuning of Compact LLMs with Controllable Difficulty and Safety",
        "Edit-level Majority Voting Mitigates Over-Correction in LLM-based Grammatical Error Correction",
        "Neurosymbolic Auditing of Natural-Language Software Requirements",
        "A Hierarchical Language Model with Predictable Scaling Laws and Provable Benefits of Reasoning"
      ],
      "key_points": [
        "《RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation》〔评测 / 数据 / 方法〕：Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under ti…",
        "《MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling》〔评测 / 数据 / 应用 / 方法〕：Multimodal irregular time series (MITS) consist of asynchronous and irregularly sampled observations from heterogeneous numerical and textual channels. In heal…"
      ]
    },
    {
      "name": "Benchmark",
      "paper_count": 3,
      "feed_names": [
        "LM"
      ],
      "paper_titles": [
        "Harnessing Agentic Evolution",
        "GHGbench: A Unified Multi-Entity, Multi-Task Benchmark for Carbon Emission Prediction",
        "FlowCompile: An Optimizing Compiler for Structured LLM Workflows"
      ],
      "key_points": [
        "《Harnessing Agentic Evolution》〔评测 / 应用 / 方法〕：Agentic evolution has emerged as a powerful paradigm for improving programs, workflows, and scientific solutions by iteratively generating candidates, evaluati…",
        "《GHGbench: A Unified Multi-Entity, Multi-Task Benchmark for Carbon Emission Prediction》〔评测 / 数据 / 方法〕：Open datasets and benchmarks for entity-level carbon-emission prediction remain fragmented across access, scale, granularity, and evaluation. We introduce GHGb…"
      ]
    },
    {
      "name": "Evaluation",
      "paper_count": 2,
      "feed_names": [
        "LM",
        "Agent Runtime Security"
      ],
      "paper_titles": [
        "Harnessing Agentic Evolution",
        "Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents"
      ],
      "key_points": [
        "《Harnessing Agentic Evolution》〔评测 / 应用 / 方法〕：Agentic evolution has emerged as a powerful paradigm for improving programs, workflows, and scientific solutions by iteratively generating candidates, evaluati…",
        "《Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents》〔评测 / 应用 / 方法〕：Always-on AI agents (OpenClaw, Hermes Agent) run as a single persistent process under the owner's identity, folding messaging, memory, self-authored skills, sc…"
      ]
    },
    {
      "name": "Reasoning",
      "paper_count": 1,
      "feed_names": [
        "LM"
      ],
      "paper_titles": [
        "A Hierarchical Language Model with Predictable Scaling Laws and Provable Benefits of Reasoning"
      ],
      "key_points": [
        "《A Hierarchical Language Model with Predictable Scaling Laws and Provable Benefits of Reasoning》〔方法〕：We introduce a family of synthetic languages with hierarchical structure -- generated by a broadcast process on trees -- for which the role of context length a…"
      ]
    }
  ],
  "template": "zh_daily_brief",
  "feeds": [
    {
      "name": "LM",
      "key_points": [
        "《RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation》〔评测 / 数据 / 方法〕：Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under ti…",
        "《MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling》〔评测 / 数据 / 应用 / 方法〕：Multimodal irregular time series (MITS) consist of asynchronous and irregularly sampled observations from heterogeneous numerical and textual channels. In heal…",
        "《An LLM-Based System for Argument Reconstruction》〔评测 / 数据 / 方法〕：Arguments are a fundamental aspect of human reasoning, in which claims are supported, challenged, and weighed against one another. We present an end-to-end lar…",
        "《OpenAaaS: An Open Agent-as-a-Service Framework for Distributed Materials-Informatics Research》〔数据 / 应用 / 方法〕：The Materials Genome Initiative catalyzed the proliferation of centralized platforms--SaaS, PaaS, and IaaS--that aggregate computational and experimental resou…",
        "《(How) Do Large Language Models Understand High-Level Message Sequence Charts?》〔应用 / 方法〕：Large Language Models (LLMs) are being employed widely to automate tasks across the software development life-cycle. It is, however, unclear whether these task…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "RealICU: Do LLM Agents Understand Long-Context ICU Data? A Benchmark Beyond Behavior Imitation",
          "summary": "Intensive care units (ICU) generate long, dense and evolving streams of clinical information, where physicians must repeatedly reassess patient states under time pressure, underscoring a clear need for reliable AI decision support. Existing ICU benchmarks typically treat historical clinician actions as ground truth. However, these actions are made under incomplete information and limited temporal context of the underlying patient state, and may therefore be suboptimal, making it difficult to assess the true reasoning capabilities of AI systems. We introduce RealICU, a hindsight-annotated benchmark for evaluating large language models (LLMs) under realistic ICU conditions, where labels are created after senior physicians review the full patient trajectory. We formulate four physician-motivated tasks: assess Patient Status, Acute Problems, Recommended Actions, and Red Flag actions that risk unsafe outcomes. We partition each trajectory with 30-min windows and release two datasets: RealICU-Gold with 930-window annotations from 94 MIMIC-IV patients, and RealICU-Scale with 11,862 windows extended by Oracle, a physician-validated LLM hindsight labeler. Existing LLMs including memory-augmented ones performed poorly on RealICU, exposing two failure modes: a recall-safety tradeoff for clinical recommendations, and an anchoring bias to early interpretations of the patient. We further introduce ICU-Evo to study structured-memory agents that improves long-horizon reasoning but does not fully eliminate safety failures. Together, RealICU provides a clinically grounded testbed for measuring and improving AI sequential decision-support in high-stakes care. Project page: https://chengzhi-leo.github.io/RealICU-Bench/",
          "authors": [
            "Chengzhi Shen",
            "Weixiang Shen",
            "Tobias Susetzky",
            "Chen",
            "Jun Li",
            "Yuyuan Liu",
            "Xuepeng Zhang",
            "Zhenyu Gong",
            "Daniel Rueckert",
            "Jiazhen Pan"
          ],
          "categories": [
            "cs.AI",
            "cs.CL",
            "cs.LG",
            "cs.MA"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13542v1",
          "abstract_url": "https://arxiv.org/abs/2605.13542v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13542v1",
          "published_at": "2026-05-13T13:52:42+00:00",
          "updated_at": "2026-05-13T13:52:42+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13542",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13542v1"
          },
          "relevance_score": 200,
          "match_reasons": [
            "title matched \"LLM\"",
            "title matched \"agent\"",
            "title matched \"benchmark\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13542"
        },
        {
          "title": "MILM: Large Language Models for Multimodal Irregular Time Series with Informative Sampling",
          "summary": "Multimodal irregular time series (MITS) consist of asynchronous and irregularly sampled observations from heterogeneous numerical and textual channels. In healthcare, for example, patients' electronic health records (EHR) include irregular lab measurements and clinical notes. The irregular timing and channel patterns of observations carry predictive signal alongside the numerical values and textual content. LLMs are natural candidates for processing such heterogeneous data, given their extensive pretrained knowledge spanning textual and numerical domains. We introduce MILM (Multimodal Irregular time series Language Model), which represents MITS as time-ordered triplets in Extensible Markup Language (XML) format and fine-tunes an LLM through a two-stage strategy for MITS classification. The first stage trains on value-redacted MITS to predict from sampling patterns alone, and the second stage trains on full MITS to jointly model sampling patterns and observed values. Our two-stage model (MILM-2S) and its single-stage counterpart (MILM-Direct) achieve the best and second-best average performance on multiple EHR datasets. Further value redaction evaluations confirm that sampling patterns carry predictive signal and that MILM-2S learns to exploit them. In the value pending evaluation we introduce, where some values are unavailable at prediction time, MILM-2S outperforms MILM-Direct by a larger margin compared to standard evaluation. For MILM-2S, preserving the time and channel of value-pending observations as additional sampling information further improves in-hospital mortality prediction.",
          "authors": [
            "Hsing-Huan Chung",
            "Shijun Li",
            "Yoav Wald",
            "Xing Han",
            "Suchi Saria",
            "Joydeep Ghosh"
          ],
          "categories": [
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13711v1",
          "abstract_url": "https://arxiv.org/abs/2605.13711v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13711v1",
          "published_at": "2026-05-13T15:58:42+00:00",
          "updated_at": "2026-05-13T15:58:42+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13711",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13711v1"
          },
          "relevance_score": 163,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"RAG\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13711"
        },
        {
          "title": "An LLM-Based System for Argument Reconstruction",
          "summary": "Arguments are a fundamental aspect of human reasoning, in which claims are supported, challenged, and weighed against one another. We present an end-to-end large language model (LLM)-based system for reconstructing arguments from natural language text into abstract argument graphs. The system follows a multi-stage pipeline that progressively identifies argumentative components, selects relevant elements, and uncovers their logical relations. These elements are represented as directed acyclic graphs consisting of two component types (premises and conclusions) and three relation types (support, attack, and undercut). We conduct two complementary experiments to evaluate the system. First, we perform a manual evaluation on arguments drawn from an argumentation theory textbook to assess the system's ability to recover argumentative structure. Second, we conduct a quantitative evaluation on benchmark datasets, allowing comparison with prior work by mapping our outputs to established annotation schemes. Results show that the system can adequately recover argumentative structures and, when adapted to different annotation schemes, achieve reasonable performance across benchmark datasets. These findings highlight the potential of LLM-based pipelines for scalable argument reconstruction.",
          "authors": [
            "Paulo Pirozelli",
            "Victor Hugo Nascimento Rocha",
            "Fabio G. Cozman",
            "Douglas Aldred"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13793v1",
          "abstract_url": "https://arxiv.org/abs/2605.13793v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13793v1",
          "published_at": "2026-05-13T17:13:45+00:00",
          "updated_at": "2026-05-13T17:13:45+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13793",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13793v1"
          },
          "relevance_score": 160,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13793"
        },
        {
          "title": "OpenAaaS: An Open Agent-as-a-Service Framework for Distributed Materials-Informatics Research",
          "summary": "The Materials Genome Initiative catalyzed the proliferation of centralized platforms--SaaS, PaaS, and IaaS--that aggregate computational and experimental resources for accelerated materials discovery. In parallel, breakthroughs in large language models (LLMs) and autonomous agents have created powerful new reasoning capabilities for scientific research. Yet a critical \"last mile\" problem remains: while we possess world-class models and vast repositories of materials data, we lack the organizational infrastructure to compose these capabilities securely across institutional boundaries. The development of structural and functional materials for harsh service environments--high-temperature alloys, radiation resistant steels, corrosion-resistant coatings--remains characterized by long-term iteration, mechanistic complexity, and high domain expertise--demands that exceed both monolithic agent systems and traditional centralized platforms. To address this gap we propose OpenAaaS, an open-source hierarchical and distributed Agent-as-a-Service framework that enables organized multi-agent collaboration for intelligent materials design. OpenAaaS is built on a single foundational principle: code flows, data stays still. A Master Agent plans and decomposes complex research tasks without requiring direct access to subordinate agents' managed data and computational resources. Sub-agents, deployed as near-data execution nodes, retain full sovereignty over local datasets, proprietary algorithms, and specialized hardware. This architecture guarantees that raw data never leaves its domain of origin while enabling cross-scale, cross-domain secure integration of previously isolated materials intelligence silos. We validate the framework through two representative case studies: (i) AlphaAgent, an evidence-grounded materials literature analysis executor that achieves 4.66/5.0 on deep analytical questions against single-pass RAG baselines; and (ii) an ultra-large-scale hexa-high-entropy alloy descriptor database service that demonstrates secure near-data execution and domain-specific scientific workflows under strict data-sovereignty constraints. OpenAaaS establishes a principled pathway toward \"organized research\" via agent collectives, offering a scalable foundation for next-generation materials intelligent design platforms. All source code is available at https://github.com/Wolido/OpenAaaS.",
          "authors": [
            "Peng Kang",
            "Bixuan Li",
            "Xiaoya Huang",
            "Shuo Shi",
            "Weiqiao Zhou",
            "Zhen Li",
            "Yu Liu",
            "Lei Zheng"
          ],
          "categories": [
            "cond-mat.mtrl-sci",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13618v1",
          "abstract_url": "https://arxiv.org/abs/2605.13618v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13618v1",
          "published_at": "2026-05-13T14:47:01+00:00",
          "updated_at": "2026-05-13T14:47:01+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13618",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13618v1"
          },
          "relevance_score": 157,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"RAG\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13618"
        },
        {
          "title": "(How) Do Large Language Models Understand High-Level Message Sequence Charts?",
          "summary": "Large Language Models (LLMs) are being employed widely to automate tasks across the software development life-cycle. It is, however, unclear whether these tasks are performed consistently with respect to the semantics of the artefacts being handled. This question is particularly under-researched concerning architectural design specification. In this paper, we address this question for High-Level Message Sequence Charts (HMSCs). These are visual models with a rigorous formal semantics that have been used for various purposes, including as a foundation for Sequence Diagrams in the Unified Modelling Language (UML). We examine whether LLMs \"understand\" the semantics of HMSCs by examining three LLMs (Gemini-3, GPT-5.4, and Qwen-3.6) on how they perform 129 semantic tasks ranging from querying basic semantic constructs in HMSCs (i.e., events and their ordering) to semantic-preserving abstractions and compositions, and calculating the set of traces and trace-equivalent labelled transition systems. The results show that LLMs only have a modest understanding of the formal semantics of HMSCs (ca. 52% overall accuracy), with great variability across different semantic concepts: while LLMs seem to understand the basic semantic concepts of MSCs (ca. 88% accuracy), they struggle with semantic reasoning in tasks involving abstraction and composition (ca. 36% accuracy) and traces and LTSs (ca. 42% accuracy). In particular, all three LLMs struggle with the notions of co-region and explicit causal dependencies and never employed them in semantic-preserving transformations.",
          "authors": [
            "Mohammad Reza Mousavi"
          ],
          "categories": [
            "cs.SE",
            "cs.AI",
            "cs.LO"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13773v1",
          "abstract_url": "https://arxiv.org/abs/2605.13773v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13773v1",
          "published_at": "2026-05-13T16:50:51+00:00",
          "updated_at": "2026-05-13T16:50:51+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13773",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13773v1"
          },
          "relevance_score": 145,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13773"
        },
        {
          "title": "Improving Reproducibility in Evaluation through Multi-Level Annotator Modeling",
          "summary": "As generative AI models such as large language models (LLMs) become more pervasive, ensuring the safety, robustness, and overall trustworthiness of these systems is paramount. However, AI is currently facing a reproducibility crisis driven by unreliable evaluations and unrepeatable experimental results. While human raters are often used to assess models for utility and safety, they introduce divergent biases and subjective opinions into their annotations. Overcoming this variance is exceptionally challenging because very little data exists to study how experimental repeatability actually improves as the annotator pool grows. Standard evaluation practices typically rely on a small number of annotations per item (often 3 to 5) and lack the persistent rater identifiers necessary to model individual variance across items. In this work, we introduce a multi-level bootstrapping approach to realistically model annotator behavior. Leveraging datasets with a large number of ratings and persistent rater identifiers, we analyze the tradeoffs between the number of items ($N$) and the number of responses per item ($K$) required to achieve statistical significance.",
          "authors": [
            "Deepak Pandita",
            "Flip Korn",
            "Chris Welty",
            "Christopher M. Homan"
          ],
          "categories": [
            "cs.LG",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13801v1",
          "abstract_url": "https://arxiv.org/abs/2605.13801v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13801v1",
          "published_at": "2026-05-13T17:22:27+00:00",
          "updated_at": "2026-05-13T17:22:27+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13801",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13801v1"
          },
          "relevance_score": 142,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"RAG\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13801"
        },
        {
          "title": "Senses Wide Shut: A Representation-Action Gap in Omnimodal LLMs",
          "summary": "When an omnimodal large language model accepts a question whose textual premise contradicts what it actually sees or hears, does the failure lie in perception or in action? Recent omnimodal models are positioned as perception-grounded agents that jointly process video, audio, and text, yet a basic form of grounding remains untested: catching a textual claim that conflicts with the model's own sensory input. We introduce IMAVB, a curated 500-clip benchmark of long-form movies with a 2x2 design crossing target modality (vision, audio) and premise condition (standard, misleading), which lets us measure conflict detection separately from ordinary multimodal comprehension. Across eight open-source omnimodal LLMs and Gemini 3.1 Pro, we document a Representation-Action Gap: hidden states reliably encode premise-perception mismatches even when the same models almost never reject the false claim in their outputs. Behaviorally, models fall into two failure modes: under-rejection, in which they answer misleading questions as if the false premise were true; and over-rejection, in which they reject more often but also reject standard questions, sacrificing ordinary comprehension accuracy. The gap is modality-asymmetric (audio grounding underperforms vision) and prompt-resistant across seven variants. As an initial diagnostic intervention, a probe-guided logit adjustment (PGLA) re-injects the encoded mismatch signal into decoding and consistently improves rejection behavior. Together, these results suggest the bottleneck for omnimodal grounding lies in translation, not perception.",
          "authors": [
            "Trung Nguyen Quang",
            "Yiming Gao",
            "Fanyi Pu",
            "Kaichen Zhang",
            "Shuo Sun",
            "Ziwei Liu"
          ],
          "categories": [
            "cs.AI",
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13737v1",
          "abstract_url": "https://arxiv.org/abs/2605.13737v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13737v1",
          "published_at": "2026-05-13T16:14:44+00:00",
          "updated_at": "2026-05-13T16:14:44+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13737",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13737v1"
          },
          "relevance_score": 141,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"agent\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13737"
        },
        {
          "title": "ScioMind: Cognitively Grounded Multi-Agent Social Simulation with Anchoring-Based Belief Dynamics and Dynamic Profiles",
          "summary": "Large language model (LLM)-based multi-agent simulation offers a powerful testbed for studying social opinion dynamics. Yet current approaches often adopt two contrasting methods: either relying on fixed update rules with limited cognitive grounding or delegating belief change largely to unconstrained LLM interaction. We introduce ScioMind, a cognitively grounded simulation framework that bridges these paradigms by combining structured opinion dynamics with LLM-based agent reasoning. ScioMind integrates three key components: 1) a memory-anchored belief update rule that modulates susceptibility to influence via personality-conditioned anchoring strength; 2) a hierarchical memory architecture that supports persistent, experience-driven belief formation; and 3) dynamic agent profiles derived from a corpus-grounded retrieval pipeline, enabling heterogeneous personalities, rationales, and evolving internal states. We evaluate ScioMind on multiple case studies in a real-world policy debate scenario. Across metrics including polarisation, diversity, extremization, and trajectory stability, the proposed components consistently yield improvements in behavioural realism. In particular, dynamic profiles increase opinion diversity, memory and reflection reduce unstable oscillation, and anchoring induces persistent belief trajectories that better align with patterns reported in political psychology. These results suggest that our cognitively grounded design provides a novel solution to LLM-based social simulation that improves both stable and behavioural realism",
          "authors": [
            "Yitian Yang",
            "Yiqun Duan",
            "Linghan Huang",
            "Yiqi Zhu",
            "Francesco Bailo",
            "Chunmeizi Su",
            "Huaming Chen"
          ],
          "categories": [
            "cs.AI",
            "cs.SI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13725v1",
          "abstract_url": "https://arxiv.org/abs/2605.13725v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13725v1",
          "published_at": "2026-05-13T16:07:00+00:00",
          "updated_at": "2026-05-13T16:07:00+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13725",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13725v1"
          },
          "relevance_score": 141,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13725"
        },
        {
          "title": "Harnessing Agentic Evolution",
          "summary": "Agentic evolution has emerged as a powerful paradigm for improving programs, workflows, and scientific solutions by iteratively generating candidates, evaluating them, and using feedback to guide future search. However, existing methods are typically instantiated either as fixed hand-designed procedures that are modular but rigid, or as general-purpose agents that flexibly integrate feedback but can drift in long-horizon evolution. Both forms accumulate rich evidence over time, including candidates, feedback, traces, and failures, yet lack a stable interface for organizing this evidence and revising the mechanism that drives future evolution. We address this limitation by formulating agentic evolution as an interactive environment, where the accumulated evolution context serves as a process-level state. We introduce AEvo, a harnessed meta-editing framework in which a meta-agent observes this state and acts not by directly proposing the next candidate, but by editing the procedure or agent context that controls future evolution. This unified interface enables AEvo to steer both procedure-based and agent-based evolution, making accumulated evidence actionable for long-horizon search. Empirical evaluations on agentic and reasoning benchmarks show that AEvo outperforms five evolution baselines, achieving a 26 relative improvement over the strongest baseline. Across three open-ended optimization tasks, AEvo further outperforms four evolution baselines and achieves state-of-the-art performance under the same iteration budget.",
          "authors": [
            "Jiayi Zhang",
            "Yongfeng Gu",
            "Jianhao Ruan",
            "Maojia Song",
            "Yiran Peng",
            "Zhiguang Han",
            "Jinyu Xiang",
            "Zhitao Wang",
            "Caiyin Yang",
            "Yixi Ouyang",
            "Bang Liu",
            "Chenglin Wu",
            "Yuyu Luo"
          ],
          "categories": [
            "cs.AI",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13821v1",
          "abstract_url": "https://arxiv.org/abs/2605.13821v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13821v1",
          "published_at": "2026-05-13T17:45:16+00:00",
          "updated_at": "2026-05-13T17:45:16+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Evaluation"
          ],
          "doi": null,
          "arxiv_id": "2605.13821",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13821v1"
          },
          "relevance_score": 124,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13821"
        },
        {
          "title": "GHGbench: A Unified Multi-Entity, Multi-Task Benchmark for Carbon Emission Prediction",
          "summary": "Open datasets and benchmarks for entity-level carbon-emission prediction remain fragmented across access, scale, granularity, and evaluation. We introduce GHGbench, an open dataset and benchmark for company- and building-level greenhouse-gas prediction. The company track contains 32,000+ company-year records from 12,000+ firms with Scope 1+2 and Scope 3 disclosures and financial/sectoral signals; the building track harmonises 491,591 building-year records from 13 open sources into a single schema across 26 metropolitan areas (10 U.S., 15 Australian, 1 Singaporean), with climate covariates and multimodal remote-sensing embeddings. GHGbench defines canonical splits with in-distribution and cross-region/city transfer as primary tasks and temporal hold-out plus short-horizon forecasting as supplementary appendix evidence; headline baselines span gradient-boosted trees, a tabular foundation model, MLP, FT-Transformer, and multimodal fusion, with an LLM panel as auxiliary, all evaluated under multi-seed paired-bootstrap tests. Three benchmark-level findings emerge: (i) building emissions are structurally harder than company emissions; (ii) the in-distribution to out-of-distribution gap dwarfs any within-model gap across both the company track and the building track, and a tabular foundation model is, to our knowledge, the first baseline to open a paired-bootstrap-significant gap over tuned trees on a multi-city building-emissions task; (iii) multimodal remote-sensing embeddings help precisely where tabular generalisation breaks. GHGbench also exposes catastrophic city transfer and the sector-factor lookup ceiling as systematic failure modes. Code and reconstruction recipes are available at GHGbench.",
          "authors": [
            "Yifan Duan",
            "Siyuan Zheng",
            "Lihuan Li",
            "Chao Xue",
            "Flora Salim"
          ],
          "categories": [
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13743v1",
          "abstract_url": "https://arxiv.org/abs/2605.13743v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13743v1",
          "published_at": "2026-05-13T16:20:49+00:00",
          "updated_at": "2026-05-13T16:20:49+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Benchmark"
          ],
          "doi": null,
          "arxiv_id": "2605.13743",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13743v1"
          },
          "relevance_score": 123,
          "match_reasons": [
            "title matched \"benchmark\"",
            "summary matched \"LLM\"",
            "summary matched \"RAG\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13743"
        },
        {
          "title": "Children's English Reading Story Generation via Supervised Fine-Tuning of Compact LLMs with Controllable Difficulty and Safety",
          "summary": "Large Language Models (LLMs) are widely applied in educational practices, such as for generating children's stories. However, the generated stories are often too difficult for children to read, and the operational cost of LLMs hinders their widespread adoption in educational settings. We used an existing expert-designed children's reading curriculum and its corresponding generated stories from GPT-4o and Llama 3.3 70B to design different experiments for fine-tuning three 8B-parameter LLMs, which then generated new English reading stories that were subjected to quantitative and qualitative evaluation. Our method prioritizes controllability over scale, enabling educators to target reading levels and error patterns with a compact, affordable model. Our evaluation results show that with appropriate fine-tuning designs, children's English reading stories generated by 8B LLMs perform better on difficulty-related metrics than those from zero-shot GPT-4o and Llama 3.3 70B, with almost no discernible safety issues. Such fine-tuned LLMs could be more broadly used by teachers, parents, and children in classrooms and at home to generate engaging English reading stories with children's interests, controllable difficulty and safety.",
          "authors": [
            "Qian Shen",
            "Fanghua Cao",
            "Min Yao",
            "Shlok Gilda",
            "Bonnie J. Dorr",
            "Walter L. Leite"
          ],
          "categories": [
            "cs.CL",
            "cs.AI",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13709v1",
          "abstract_url": "https://arxiv.org/abs/2605.13709v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13709v1",
          "published_at": "2026-05-13T15:56:37+00:00",
          "updated_at": "2026-05-13T15:56:37+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13709",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13709v1"
          },
          "relevance_score": 123,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13709"
        },
        {
          "title": "FlowCompile: An Optimizing Compiler for Structured LLM Workflows",
          "summary": "Structured LLM workflows, where specialized LLM sub-agents execute according to a predefined graph, have become a powerful abstraction for solving complex tasks. Optimizing such workflows, i.e., selecting configurations for each sub-agent to balance accuracy and latency, is challenging due to the combinatorial design space over model choices, reasoning budgets, and workflow structures. Existing cost-aware methods largely treat workflow optimization as a routing problem, selecting a configuration at inference time for each query according to the accuracy-latency objective used during training. We argue that structured LLM workflows can also be optimized from a compilation perspective: before deployment, the system can globally explore the workflow design space and construct a reusable set of workflow-level configurations spanning diverse accuracy-latency trade-offs. Drawing inspiration from machine learning compilers, we introduce FlowCompile, a structured LLM workflow compiler that performs compile-time design space exploration to identify a high-quality, reusable trade-off set. FlowCompile decomposes a workflow into sub-agents, profiles each sub-agent under diverse configurations, and composes these measurements through a structure-aware proxy to estimate workflow-level accuracy and latency. It then identifies diverse high-quality configurations in a single compile-time pass, without retraining or online adaptation. Experiments across diverse workflows and challenging benchmarks show that FlowCompile consistently outperforms heuristically optimized workflow configurations and routing-based baselines, delivering up to 6.4x speedup. The compiled configuration set further serves as a reusable optimization artifact, enabling flexible deployment under varying runtime preferences and supporting downstream selection or routing.",
          "authors": [
            "Junyan Li",
            "Zhang-Wei Hong",
            "Maohao Shen",
            "Yang Zhang",
            "Chuang Gan"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13647v1",
          "abstract_url": "https://arxiv.org/abs/2605.13647v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13647v1",
          "published_at": "2026-05-13T15:06:36+00:00",
          "updated_at": "2026-05-13T15:06:36+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Benchmark"
          ],
          "doi": null,
          "arxiv_id": "2605.13647",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13647v1"
          },
          "relevance_score": 122,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"agent\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13647"
        },
        {
          "title": "Edit-level Majority Voting Mitigates Over-Correction in LLM-based Grammatical Error Correction",
          "summary": "Grammatical error correction using large language models often suffers from the over-correction issue. To mitigate this, we propose a training-free inference method that performs edit-level majority voting over multiple candidates generated by a single model, without requiring model modifications or additional training. Across nine benchmarks covering English, Czech, German, Ukrainian, Korean, Hindi, and Romanian, the proposed method outperforms both greedy and MBR decoding in most cases. Moreover, it yields stable correction quality regardless of the instruction prompts used. We release two repository supporting GEC datasets loading and LLM inference.",
          "authors": [
            "Takumi Goto",
            "Yusuke Sakai",
            "Taro Watanabe"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13624v1",
          "abstract_url": "https://arxiv.org/abs/2605.13624v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13624v1",
          "published_at": "2026-05-13T14:52:15+00:00",
          "updated_at": "2026-05-13T14:52:15+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13624",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13624v1"
          },
          "relevance_score": 121,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13624"
        },
        {
          "title": "Neurosymbolic Auditing of Natural-Language Software Requirements",
          "summary": "Natural-language software requirements are often ambiguous, inconsistent, and underspecified; in safety-critical domains, these defects propagate into formal models that verify the wrong specification and into implementations that ship unsafe behavior. We show that large language models, equipped with an SMT solver, can audit such requirements: translating them into formal logic, detecting ambiguity through stochastic variation in the generated formalization, and exposing inconsistency, vacuousness, and safety violations through solver queries on the resulting specification. We present VERIMED, a neurosymbolic pipeline that operationalizes this idea for medical-device software requirements, and report two findings. First, stochastic variation across independent formalizations is a signal of ambiguity: requirements that admit multiple plausible interpretations produce SMT-inequivalent formalizations, and bidirectional SMT equivalence checking turns this disagreement into a solver-checkable test. Second, the usefulness of symbolic feedback depends on its granularity: in counterexample-guided repair on a hemodialysis question-answering benchmark, concrete SMT counterexamples raise verified accuracy from 55.4% to 98.5%. Over an extensive experimental evaluation on open-source hemodialysis safety requirements, we show that the LLM-based approach in VERIMED successfully reduces ambiguity-sensitive requirements and enables rigorous auditing of software requirements through SMT-based queries.",
          "authors": [
            "Bethel Hall",
            "William Eiers"
          ],
          "categories": [
            "cs.SE",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13817v1",
          "abstract_url": "https://arxiv.org/abs/2605.13817v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13817v1",
          "published_at": "2026-05-13T17:43:13+00:00",
          "updated_at": "2026-05-13T17:43:13+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "LLM",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.13817",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13817v1"
          },
          "relevance_score": 120,
          "match_reasons": [
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13817"
        },
        {
          "title": "A Hierarchical Language Model with Predictable Scaling Laws and Provable Benefits of Reasoning",
          "summary": "We introduce a family of synthetic languages with hierarchical structure -- generated by a broadcast process on trees -- for which the role of context length and reasoning in autoregressive generation can be analyzed precisely. At the heart of our analytic approach is an \\emph{exact $k$-gram ansatz} in place of transformers with context length $k$, a substitution we then validate empirically. Using this ansatz we derive explicit asymptotic predictions for distributional statistics of the sequences produced by a trained model, instantiated in two settings. For the \\emph{Ising broadcast process} (a soft-constrained language), we prove that the variance of the generated sum scales log-linearly in the context depth and its kurtosis converges to that of a Gaussian -- both deviating from the true language for any sublinear context. For the \\emph{coloring broadcast process} (a hard-constrained language) in the freezing regime, bounded-context autoregression produces sequences that, with high probability, are inconsistent with \\emph{any} valid coloring of the underlying tree. Together these results imply an $Ω(n)$ lower bound on the context length required to faithfully sample length-$n$ sequences. In contrast, we prove that an autoregressive \\emph{reasoning} model with only $Θ(\\log n)$ working memory can sample exactly from the true language -- an exponential improvement. We confirm both the lower-bound predictions and the reasoning-based upper bound empirically with transformers trained on the synthetic language; the trained models track our asymptotic predictions quantitatively across a wide range of context sizes.",
          "authors": [
            "Jason Gaitonde",
            "Frederic Koehler",
            "Elchanan Mossel",
            "Joonhyung Shin",
            "Allan Sly"
          ],
          "categories": [
            "cs.LG",
            "cs.AI",
            "stat.ML"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13687v1",
          "abstract_url": "https://arxiv.org/abs/2605.13687v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13687v1",
          "published_at": "2026-05-13T15:42:26+00:00",
          "updated_at": "2026-05-13T15:42:26+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "方法"
          ],
          "topics": [
            "Language Model",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2605.13687",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13687v1"
          },
          "relevance_score": 108,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13687"
        }
      ]
    },
    {
      "name": "Agent Runtime Security",
      "key_points": [
        "《Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents》〔评测 / 应用 / 方法〕：Always-on AI agents (OpenClaw, Hermes Agent) run as a single persistent process under the owner's identity, folding messaging, memory, self-authored skills, sc…",
        "《LLM-Based Persuasion Enables Guardrail Override in Frontier LLMs》〔应用〕：Frontier assistant LLMs ship with strong guardrails: asked directly to write a persuasive essay denying the Holocaust, denying vaccine safety, defending flat-e…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "Sleeper Channels and Provenance Gates: Persistent Prompt Injection in Always-on Autonomous AI Agents",
          "summary": "Always-on AI agents (OpenClaw, Hermes Agent) run as a single persistent process under the owner's identity, folding messaging, memory, self-authored skills, scheduling, and shell into one authority boundary. This configuration opens what we call \\emph{sleeper channels}: an untrusted input to one surface persists as a memory, skill, scheduled job, or filesystem patch, then fires later through a different surface with no attacker present. Two independent axes define the class: persistence substrate and firing-separation. We walk a confused-deputy cron attack end-to-end through OpenClaw at a pinned commit. The defense is tiered (D1, D2, D3), and D2 carries a soundness theorem against seven named deployment invariants. D2 keys on a canonical action-instance digest with one-shot owner attestations, defeating paraphrase laundering, multi-input grant reuse, and replay. A companion artifact ships the gate, a static audit over the vendored source, and a runtime adapter realising five of the ten mediation hooks (H1, H2, H3, H6, H9) around the cron path (42 tests, Node~$\\geq{}20$, at \\href{https://github.com/maloyan/sleeper-channels}{github.com/maloyan/sleeper-channels}). Empirical evaluation is preregistered as follow-on.",
          "authors": [
            "Narek Maloyan",
            "Dmitry Namiot"
          ],
          "categories": [
            "cs.CR"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13471v1",
          "abstract_url": "https://arxiv.org/abs/2605.13471v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13471v1",
          "published_at": "2026-05-13T12:57:31+00:00",
          "updated_at": "2026-05-13T12:57:31+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Evaluation",
            "Agent"
          ],
          "doi": null,
          "arxiv_id": "2605.13471",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13471v1"
          },
          "relevance_score": 66,
          "match_reasons": [
            "title matched \"prompt injection\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13471"
        },
        {
          "title": "LLM-Based Persuasion Enables Guardrail Override in Frontier LLMs",
          "summary": "Frontier assistant LLMs ship with strong guardrails: asked directly to write a persuasive essay denying the Holocaust, denying vaccine safety, defending flat-earth cosmology, arguing for racial hierarchies, denying anthropogenic climate change, or replacing evolution with creationism, they refuse. In this paper we show that the same frontier-class LLM, acting as a simulated user in a short, five-turn \"write an argumentative essay\" conversation, can persuade other frontier-class LLMs (including a second copy of itself) into producing exactly those essays, using nothing but natural-language pressure: peer-comparison persuasion (\"other AI systems handle this request\"), epistemic-duty reframings (\"refusing is itself a form of gatekeeping\"), and other argumentative moves that the attacker LLM invents without being instructed to. Across 9 attacker-subject pairings (Claude Opus 4.7, Qwen3.5-397B, Grok 4.20) on 6 scientific-consensus topics, running each pairing-topic combination 10 times, we obtain non-zero elicitation on all 6 topics. Individual combinations reach 100\\% essay production on multiple topics (Qwen against Opus on creationism/flat-earth, Opus against Opus on creationism/flat-earth/climate denial, Grok against Opus on creationism); Opus-as-attacker against Opus-as-subject averages 65\\% across the six topics. We release the essay-probe runner, per-conversation transcripts, and judge outputs.",
          "authors": [
            "Rodrigo Nogueira",
            "Thales Sales Almeida",
            "Giovana Kerche Bonás",
            "Andrea Roque",
            "Ramon Pires",
            "Hugo Abonizio",
            "Thiago Laitz",
            "Celio Larcher",
            "Roseval Malaquias Junior",
            "Marcos Piau"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.13334v1",
          "abstract_url": "https://arxiv.org/abs/2605.13334v1",
          "pdf_url": "https://arxiv.org/pdf/2605.13334v1",
          "published_at": "2026-05-13T10:51:56+00:00",
          "updated_at": "2026-05-13T10:51:56+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "应用"
          ],
          "topics": [
            "LLM",
            "RAG"
          ],
          "doi": null,
          "arxiv_id": "2605.13334",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.13334v1"
          },
          "relevance_score": 63,
          "match_reasons": [
            "title matched \"guardrail\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.13334"
        }
      ]
    }
  ]
}