{
  "generated_at": "2026-05-13T12:54:34.123019+08:00",
  "timezone": "Asia/Shanghai",
  "lookback_hours": 24,
  "sorting": {
    "default_sort_by": "hybrid",
    "summary": "hybrid (relevance first, published_at tie-break)",
    "weights": {
      "title_match_weight": 40,
      "summary_match_weight": 18,
      "doi_weight": 12,
      "pdf_weight": 8,
      "rich_summary_weight": 6,
      "metadata_weight": 4,
      "multi_source_weight": 10,
      "freshness_weight_cap": 24
    },
    "feeds": [
      {
        "name": "LM",
        "sort_by": "hybrid"
      },
      {
        "name": "Agent Runtime Security",
        "sort_by": "hybrid"
      }
    ]
  },
  "highlights": [
    "主题「Language Model」：命中 15 篇，覆盖 LM、Agent Runtime Security，代表论文包括 《MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering》、《ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models》。",
    "主题「LLM」：命中 14 篇，覆盖 LM，代表论文包括 《MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering》、《ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models》。",
    "主题「Large Language Model」：命中 2 篇，覆盖 LM、Agent Runtime Security，代表论文包括 《$δ$-mem: Efficient Online Memory for Large Language Models》、《Metaphor Is Not All Attention Needs》。",
    "主题「Benchmark」：命中 1 篇，覆盖 LM，代表论文包括 《MEME: Multi-entity & Evolving Memory Evaluation》。",
    "主题「DATA Exfiltration」：命中 1 篇，覆盖 Agent Runtime Security，代表论文包括 《A microservices-based endpoint monitoring platform with predictive NLP models for real-time security and hate-speech risk alerting》。"
  ],
  "focus_items": [],
  "action_items": [],
  "topic_sections": [
    {
      "name": "Language Model",
      "paper_count": 15,
      "feed_names": [
        "LM",
        "Agent Runtime Security"
      ],
      "paper_titles": [
        "MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering",
        "ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models",
        "Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models",
        "Question Difficulty Estimation for Large Language Models via Answer Plausibility Scoring",
        "Overview of the MedHopQA track at BioCreative IX: track description, participation and evaluation of systems for multi-hop medical question answering",
        "Pretraining Exposure Explains Popularity Judgments in Large Language Models",
        "OGLS-SD: On-Policy Self-Distillation with Outcome-Guided Logit Steering for LLM Reasoning",
        "Scalable Token-Level Hallucination Detection in Large Language Models",
        "$δ$-mem: Efficient Online Memory for Large Language Models",
        "Towards Automated Air Traffic Safety Assessment Around Non-Towered Airports Using Large Language Models",
        "TextSeal: A Localized LLM Watermark for Provenance & Distillation Protection",
        "Formalize, Don't Optimize: The Heuristic Trap in LLM-Generated Combinatorial Solvers",
        "Iterative Audit Convergence in LLM-Managed Multi-Agent Systems: A Case Study in Prompt Engineering Quality Assurance",
        "Instruction Lens Score: Your Instruction Contributes a Powerful Object Hallucination Detector for Multimodal Large Language Models",
        "Metaphor Is Not All Attention Needs"
      ],
      "key_points": [
        "《MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering》〔评测 / 数据 / 应用 / 方法〕：Evaluating large language models (LLMs) in the biomedical domain requires benchmarks that can distinguish reasoning from pattern matching and remain discrimina…",
        "《ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models》〔评测 / 应用 / 方法〕：Large language models (LLMs) often produce answers with high certainty even when they are incorrect, making reliable confidence estimation essential for deploy…"
      ]
    },
    {
      "name": "LLM",
      "paper_count": 14,
      "feed_names": [
        "LM"
      ],
      "paper_titles": [
        "MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering",
        "ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models",
        "Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models",
        "Question Difficulty Estimation for Large Language Models via Answer Plausibility Scoring",
        "Overview of the MedHopQA track at BioCreative IX: track description, participation and evaluation of systems for multi-hop medical question answering",
        "Pretraining Exposure Explains Popularity Judgments in Large Language Models",
        "OGLS-SD: On-Policy Self-Distillation with Outcome-Guided Logit Steering for LLM Reasoning",
        "Scalable Token-Level Hallucination Detection in Large Language Models",
        "Towards Automated Air Traffic Safety Assessment Around Non-Towered Airports Using Large Language Models",
        "MEME: Multi-entity & Evolving Memory Evaluation",
        "TextSeal: A Localized LLM Watermark for Provenance & Distillation Protection",
        "Formalize, Don't Optimize: The Heuristic Trap in LLM-Generated Combinatorial Solvers",
        "Iterative Audit Convergence in LLM-Managed Multi-Agent Systems: A Case Study in Prompt Engineering Quality Assurance",
        "Instruction Lens Score: Your Instruction Contributes a Powerful Object Hallucination Detector for Multimodal Large Language Models"
      ],
      "key_points": [
        "《MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering》〔评测 / 数据 / 应用 / 方法〕：Evaluating large language models (LLMs) in the biomedical domain requires benchmarks that can distinguish reasoning from pattern matching and remain discrimina…",
        "《ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models》〔评测 / 应用 / 方法〕：Large language models (LLMs) often produce answers with high certainty even when they are incorrect, making reliable confidence estimation essential for deploy…"
      ]
    },
    {
      "name": "Large Language Model",
      "paper_count": 2,
      "feed_names": [
        "LM",
        "Agent Runtime Security"
      ],
      "paper_titles": [
        "$δ$-mem: Efficient Online Memory for Large Language Models",
        "Metaphor Is Not All Attention Needs"
      ],
      "key_points": [
        "《$δ$-mem: Efficient Online Memory for Large Language Models》〔评测 / 应用 / 方法〕：Large language models increasingly need to accumulate and reuse historical information in long-term assistants and agent systems. Simply expanding the context…",
        "《Metaphor Is Not All Attention Needs》〔应用 / 方法〕：Large language models are increasingly deployed in safety-critical applications, where their ability to resist harmful instructions is essential. Although post…"
      ]
    },
    {
      "name": "Benchmark",
      "paper_count": 1,
      "feed_names": [
        "LM"
      ],
      "paper_titles": [
        "MEME: Multi-entity & Evolving Memory Evaluation"
      ],
      "key_points": [
        "《MEME: Multi-entity & Evolving Memory Evaluation》〔评测 / 方法〕：LLM-based agents increasingly operate in persistent environments where they must store, update, and reason over information across many sessions. While prior b…"
      ]
    },
    {
      "name": "DATA Exfiltration",
      "paper_count": 1,
      "feed_names": [
        "Agent Runtime Security"
      ],
      "paper_titles": [
        "A microservices-based endpoint monitoring platform with predictive NLP models for real-time security and hate-speech risk alerting"
      ],
      "key_points": [
        "《A microservices-based endpoint monitoring platform with predictive NLP models for real-time security and hate-speech risk alerting》〔方法〕：Organizations increasingly depend on endpoint devices and corporate communication channels, yet they still face critical risks such as sensitive data leakage,…"
      ]
    }
  ],
  "template": "zh_daily_brief",
  "feeds": [
    {
      "name": "LM",
      "key_points": [
        "《MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering》〔评测 / 数据 / 应用 / 方法〕：Evaluating large language models (LLMs) in the biomedical domain requires benchmarks that can distinguish reasoning from pattern matching and remain discrimina…",
        "《ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models》〔评测 / 应用 / 方法〕：Large language models (LLMs) often produce answers with high certainty even when they are incorrect, making reliable confidence estimation essential for deploy…",
        "《Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models》〔方法〕：Visual latent reasoning lets a multimodal large language model (MLLM) create intermediate visual evidence as continuous tokens, avoiding external tools or imag…",
        "《Question Difficulty Estimation for Large Language Models via Answer Plausibility Scoring》〔评测 / 数据 / 方法〕：Estimating question difficulty is a critical component in evaluating and improving large language models (LLMs) for question answering (QA). Existing approache…",
        "《Overview of the MedHopQA track at BioCreative IX: track description, participation and evaluation of systems for multi-hop medical question answering》〔评测 / 数据 / 应用 / 方法〕：Multi-hop question answering (QA) remains a significant challenge in the biomedical domain, requiring systems to integrate information across multiple sources…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "MedHopQA: A Disease-Centered Multi-Hop Reasoning Benchmark and Evaluation Framework for LLM-Based Biomedical Question Answering",
          "summary": "Evaluating large language models (LLMs) in the biomedical domain requires benchmarks that can distinguish reasoning from pattern matching and remain discriminative as model capabilities improve. Existing biomedical question answering (QA) benchmarks are limited in this respect. Multiple-choice formats can allow models to succeed through answer elimination rather than inference, while widely circulated exam-style datasets are increasingly vulnerable to performance saturation and training data contamination. Multi-hop reasoning, defined as the ability to integrate information across multiple sources to derive an answer, is central to clinically meaningful tasks such as diagnostic support, literature-based discovery, and hypothesis generation, yet remains underrepresented in current biomedical QA benchmarks. MedHopQA is a disease-centered multi-hop reasoning benchmark consisting of 1,000 expert-curated question-answer pairs introduced as a shared task at BioCreative IX. Each question requires synthesis of information across two distinct Wikipedia articles, and answers are provided in an open-ended free-text format. Gold annotations are augmented with ontology-grounded synonym sets from MONDO, NCBI Gene, and NCBI Taxonomy to support both lexical and concept-level evaluation. MedHopQA was constructed through a structured process combining human annotation, triage, iterative verification, and LLM-as-a-judge validation. To reduce leaderboard gaming and contamination risk, the 1,000 scored questions are embedded within a publicly downloadable set of 10,000 questions, with answers withheld, on a CodaBench leaderboard. MedHopQA provides both a benchmark and a reusable framework for constructing future biomedical QA datasets that prioritize compositional reasoning, saturation resistance, and contamination resistance as core design constraints.",
          "authors": [
            "Rezarta Islamaj",
            "Robert Leaman",
            "Joey Chan",
            "Nicholas Wan",
            "Qiao Jin",
            "Natalie Xie",
            "John Wilbur",
            "Shubo Tian",
            "Lana Yeganova",
            "Po-Ting Lai",
            "Chih-Hsuan Wei",
            "Yifan Yang",
            "Yao Ge",
            "Qingqing Zhu",
            "Zhizheng Wang",
            "Zhiyong Lu"
          ],
          "categories": [
            "cs.CL",
            "cs.AI",
            "cs.IR"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12361v1",
          "abstract_url": "https://arxiv.org/abs/2605.12361v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12361v1",
          "published_at": "2026-05-12T16:32:43+00:00",
          "updated_at": "2026-05-12T16:32:43+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12361",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12361v1"
          },
          "relevance_score": 225,
          "match_reasons": [
            "title matched \"LLM\"",
            "title matched \"reasoning\"",
            "title matched \"benchmark\"",
            "title matched \"evaluation\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12361"
        },
        {
          "title": "ORCE: Order-Aware Alignment of Verbalized Confidence in Large Language Models",
          "summary": "Large language models (LLMs) often produce answers with high certainty even when they are incorrect, making reliable confidence estimation essential for deployment in real-world scenarios. Verbalized confidence, where models explicitly state their confidence in natural language, provides a flexible and user-facing uncertainty signal that can be applied even when token logits are unavailable. However, existing verbalized-confidence methods often optimize answer generation and confidence generation jointly, which can cause confidence-alignment objectives to interfere with answer accuracy. In this work, we propose a decoupled and order-aware framework for verbalized confidence calibration. Our method first generates an answer and then estimates confidence conditioned on the fixed question--answer pair, allowing confidence optimization without directly perturbing the answer-generation process. To align confidence with correctness likelihood, we construct a sampling-based surrogate from multiple model completions and optimize rank-based reinforcement learning objectives that encourage responses with higher estimated correctness likelihood to receive higher verbalized confidence. Experiments on reasoning and knowledge-intensive benchmarks show that our method improves calibration and failure prediction performance while largely preserving answer accuracy. These results demonstrate that verbalized confidence can be more reliably aligned by decoupling confidence estimation from answer generation and optimizing the relative ordering of confidence across responses.",
          "authors": [
            "Chen Li",
            "Xiaoling Hu",
            "Songzhu Zheng",
            "Jiawei Zhou",
            "Chao Chen"
          ],
          "categories": [
            "cs.LG",
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12446v1",
          "abstract_url": "https://arxiv.org/abs/2605.12446v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12446v1",
          "published_at": "2026-05-12T17:39:43+00:00",
          "updated_at": "2026-05-12T17:39:43+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12446",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12446v1"
          },
          "relevance_score": 222,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "title matched \"alignment\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12446"
        },
        {
          "title": "Fill the GAP: A Granular Alignment Paradigm for Visual Reasoning in Multimodal Large Language Models",
          "summary": "Visual latent reasoning lets a multimodal large language model (MLLM) create intermediate visual evidence as continuous tokens, avoiding external tools or image generators. However, existing methods usually follow an output-as-input latent paradigm and yield unstable gains. We identify evidence for a feature-space mismatch that can contribute to this instability: dominant visual-latent models build on pre-norm MLLMs and reuse decoder hidden states as predicted latent inputs, even though these states occupy a substantially different norm regime from the input embeddings the model was trained to consume~\\citep{xie2025mhc,li2026siamesenorm,team2026attention}. This mismatch can make direct latent feedback unreliable. Motivated by this diagnosis, we propose \\textbf{GAP}, a \\textbf{G}ranular \\textbf{A}lignment \\textbf{P}aradigm for visual latent modeling. GAP aligns visual latent reasoning at three levels: feature-level alignment maps decoder outputs into input-compatible visual latents through a lightweight PCA-aligned latent head; context-level alignment grounds latent targets with inspectable auxiliary visual supervision; and capacity-guided alignment assigns latent supervision selectively to examples where the base MLLM struggles. On Qwen2.5-VL 7B, the resulting model achieves the best mean aggregate perception and reasoning performance among our supervised variants. Inference-time intervention probing further suggests that generated latents provide task-relevant visual signal beyond merely adding token slots.",
          "authors": [
            "Yanting Miao",
            "Yutao Sun",
            "Dexin Wang",
            "Mengyu Zhou",
            "Pascal Poupart",
            "Lei Lv",
            "Qi Zhao",
            "Li Wang",
            "Hao Li",
            "Xiaoxi Jiang",
            "Guanjun Jiang"
          ],
          "categories": [
            "cs.CV",
            "cs.AI",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12374v1",
          "abstract_url": "https://arxiv.org/abs/2605.12374v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12374v1",
          "published_at": "2026-05-12T16:41:09+00:00",
          "updated_at": "2026-05-12T16:41:09+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12374",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12374v1"
          },
          "relevance_score": 207,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "title matched \"reasoning\"",
            "title matched \"alignment\"",
            "summary matched \"LLM\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12374"
        },
        {
          "title": "Question Difficulty Estimation for Large Language Models via Answer Plausibility Scoring",
          "summary": "Estimating question difficulty is a critical component in evaluating and improving large language models (LLMs) for question answering (QA). Existing approaches often rely on readability formulas, retrieval-based signals, or popularity statistics, which may not fully capture the reasoning challenges posed to modern LLMs. In this paper, we introduce Q-DAPS (Question Difficulty based on Answer Plausibility Scores) method, a novel approach that estimates question difficulty by computing the entropy of plausibility scores over candidate answers. We systematically evaluate Q-DAPS across four prominent QA datasets-TriviaQA, NQ, MuSiQue, and QASC-demonstrating that it consistently outperforms baselines. Moreover, Q-DAPS shows strong robustness across hyperparameter variations and question types. Extensive ablation studies further show that Q-DAPS remains robust across different plausibility estimation paradigms, model sizes, and realistic settings. Human evaluations further confirm strong alignment between Q-DAPS's difficulty estimates and human judgments of question difficulty. Overall, Q-DAPS provides an interpretable, scalable, and bias-resilient approach to question difficulty estimation in modern QA systems.",
          "authors": [
            "Jamshid Mozafari",
            "Bhawna Piryani",
            "Adam Jatowt"
          ],
          "categories": [
            "cs.CL",
            "cs.IR"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12398v1",
          "abstract_url": "https://arxiv.org/abs/2605.12398v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12398v1",
          "published_at": "2026-05-12T17:00:02+00:00",
          "updated_at": "2026-05-12T17:00:02+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12398",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12398v1"
          },
          "relevance_score": 182,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"alignment\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12398"
        },
        {
          "title": "Overview of the MedHopQA track at BioCreative IX: track description, participation and evaluation of systems for multi-hop medical question answering",
          "summary": "Multi-hop question answering (QA) remains a significant challenge in the biomedical domain, requiring systems to integrate information across multiple sources to answer complex questions. To address this problem, the BioCreative IX MedHopQA shared task was designed to benchmark in multi-hop reasoning for large language models (LLMs). We developed a novel dataset of 1,000 challenging QA pairs spanning diseases, genes, and chemicals, with particular emphasis on rare diseases. Each question was constructed to require two-hop reasoning through the integration of information from two distinct Wikipedia pages. The challenge attracted 48 submissions from 13 teams. Systems were evaluated using both surface string comparison and conceptual accuracy (MedCPT score). The results showed a substantial performance gap between baseline LLMs and enhanced systems. The top-ranked submission achieved an 89.30% F1 score on the MedCPT metric and an 87.30% exact match (EM) score, compared with 67.40% and 60.20%, respectively, for the zero-shot baseline. A central finding of the challenge was that retrieval-augmented generation (RAG) and related retrieval-based strategies were critical for strong performance. In addition, concept-level evaluation improved answer assessment when correct responses differed in surface form. The MedHopQA dataset is publicly available to support continued progress in this important area. Challenge materials: https://www.ncbi.nlm.nih.gov/research/bionlp/medhopqa and benchmark https://www.codabench.org/competitions/7609/",
          "authors": [
            "Rezarta Islamaj",
            "Joey Chan",
            "Robert Leaman",
            "Jongmyung Jung",
            "Hyeongsoon Hwang",
            "Quoc-An Nguyen",
            "Hoang-Quynh Le",
            "Harikrishnan Gurushankar Saisudha",
            "Ganesh Chandrasekar",
            "Rustam R. Taktashov",
            "Nadezhda Yu. Bizyukova",
            "Sofia I. R. Conceição",
            "Paulo R. C. Lopes",
            "Reem Abdel Salam",
            "Mary Adewunmi",
            "Zhiyong Lu"
          ],
          "categories": [
            "cs.CL",
            "cs.IR"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12313v1",
          "abstract_url": "https://arxiv.org/abs/2605.12313v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12313v1",
          "published_at": "2026-05-12T15:59:28+00:00",
          "updated_at": "2026-05-12T15:59:28+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12313",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12313v1"
          },
          "relevance_score": 177,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12313"
        },
        {
          "title": "Pretraining Exposure Explains Popularity Judgments in Large Language Models",
          "summary": "Large language models (LLMs) exhibit systematic preferences for well-known entities, a phenomenon often attributed to popularity bias. However, the extent to which these preferences reflect real-world popularity versus statistical exposure during pretraining remains unclear, largely due to the inaccessibility of most training corpora. We provide the first direct, large-scale analysis of popularity bias grounded in fully observable pretraining data. Leveraging the open OLMo models and their complete pretraining corpus, Dolma, we compute precise entity-level exposure statistics across 7.4 trillion tokens. We analyze 2,000 entities spanning five types (Person, Location, Organization, Art, Product) and compare pretraining exposure against Wikipedia pageviews and two elicited LLM popularity signals: direct scalar estimation and pairwise comparison. Our results show that pretraining exposure strongly correlates with Wikipedia popularity, validating exposure as a meaningful proxy for real-world salience during the training period. More importantly, we find that LLM popularity judgments align more closely with exposure than with Wikipedia, especially when elicited via pairwise comparisons. This alignment is strongest for larger models and persists in the long tail, where Wikipedia popularity becomes unreliable. Overall, our findings demonstrate that popularity priors in LLMs are primarily shaped by pretraining statistics rather than external popularity signals, offering concrete evidence that data exposure plays a central role in driving popularity bias.",
          "authors": [
            "Jamshid Mozafari",
            "Bhawna Piryani",
            "Adam Jatowt"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12382v1",
          "abstract_url": "https://arxiv.org/abs/2605.12382v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12382v1",
          "published_at": "2026-05-12T16:45:38+00:00",
          "updated_at": "2026-05-12T16:45:38+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "数据",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": "10.1145/3805712.3809958",
          "arxiv_id": "2605.12382",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12382v1",
            "doi": "https://doi.org/10.1145/3805712.3809958"
          },
          "relevance_score": 175,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"alignment\"",
            "summary matched \"RAG\"",
            "has DOI",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.1145/3805712.3809958"
        },
        {
          "title": "OGLS-SD: On-Policy Self-Distillation with Outcome-Guided Logit Steering for LLM Reasoning",
          "summary": "We study {on-policy self-distillation} (OPSD), where a language model improves its reasoning ability by distilling privileged teacher distributions along its own on-policy trajectories. Despite the performance gains of OPSD, we identify a common but often overlooked mismatch between teacher and student responses: self-reflected teacher responses can be shifted by reflection-induced bias and response templates, leading to miscalibrated token-level supervision. To mitigate this issue, we propose \\methodname, an outcome-guided logit-steering framework that leverages verifiable outcome rewards to contrast successful and failed on-policy trajectories and calibrate teacher logits. By combining outcome-level correctness with dense token-level guidance through logit steering, \\methodname stabilizes self-distillation and improves reasoning performance over standard OPSD and other variants across diverse benchmarks.",
          "authors": [
            "Yuxiao Yang",
            "Xiaoyun Wang",
            "Weitong Zhang"
          ],
          "categories": [
            "cs.LG",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12400v1",
          "abstract_url": "https://arxiv.org/abs/2605.12400v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12400v1",
          "published_at": "2026-05-12T17:00:53+00:00",
          "updated_at": "2026-05-12T17:00:53+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12400",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12400v1"
          },
          "relevance_score": 164,
          "match_reasons": [
            "title matched \"LLM\"",
            "title matched \"reasoning\"",
            "summary matched \"language model\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12400"
        },
        {
          "title": "Scalable Token-Level Hallucination Detection in Large Language Models",
          "summary": "Large language models (LLMs) have demonstrated remarkable capabilities, but they still frequently produce hallucinations. These hallucinations are difficult to detect in reasoning-intensive tasks, where the content appears coherent but contains errors like logical flaws and unreliable intermediate results. While step-level analysis is commonly used to detect internal hallucinations, it suffers from limited granularity and poor scalability due to its reliance on step segmentation. To address these limitations, we propose TokenHD, a holistic pipeline for training token-level hallucination detectors. Specifically, TokenHD consists of a scalable data engine for synthesizing large-scale hallucination annotations along with a training recipe featuring an importance-weighted strategy for robust model training. To systematically assess the detection performance, we also provide a rigorous evaluation protocol. Through training within TokenHD, our detector operates directly on free-form text to identify hallucinations, eliminating the need for predefined step segmentation or additional text reformatting. Our experiments show that even a small detector (0.6B) achieves substantial performance gains after training, surpassing much larger reasoning models (e.g., QwQ-32B), and detection performance scales consistently with model size from 0.6B to 8B. Finally, we show that our detector can generalize well across diverse practical scenarios and explore strategies to further enhance its cross-domain generalization capability.",
          "authors": [
            "Rui Min",
            "Tianyu Pang",
            "Chao Du",
            "Minhao Cheng",
            "Yi R. Fung"
          ],
          "categories": [
            "cs.CL",
            "cs.AI",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12384v1",
          "abstract_url": "https://arxiv.org/abs/2605.12384v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12384v1",
          "published_at": "2026-05-12T16:47:40+00:00",
          "updated_at": "2026-05-12T16:47:40+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12384",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12384v1"
          },
          "relevance_score": 163,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12384"
        },
        {
          "title": "$δ$-mem: Efficient Online Memory for Large Language Models",
          "summary": "Large language models increasingly need to accumulate and reuse historical information in long-term assistants and agent systems. Simply expanding the context window is costly and often fails to ensure effective context utilization. We propose $δ$-mem, a lightweight memory mechanism that augments a frozen full-attention backbone with a compact online state of associative memory. $δ$-mem compresses past information into a fixed-size state matrix updated by delta-rule learning, and uses its readout to generate low-rank corrections to the backbone's attention computation during generation. With only an $8\\times8$ online memory state, $δ$-mem improves the average score to $1.10\\times$ that of the frozen backbone and $1.15\\times$ that of the strongest non-$δ$-mem memory baseline. It achieves larger gains on memory-heavy benchmarks, reaching $1.31\\times$ on MemoryAgentBench and $1.20\\times$ on LoCoMo, while largely preserving general capabilities. These results show that effective memory can be realized through a compact online state directly coupled with attention computation, without full fine-tuning, backbone replacement, or explicit context extension.",
          "authors": [
            "Jingdi Lei",
            "Di Zhang",
            "Junxian Li",
            "Weida Wang",
            "Kaixuan Fan",
            "Xiang Liu",
            "Qihan Liu",
            "Xiaoteng Ma",
            "Baian Chen",
            "Soujanya Poria"
          ],
          "categories": [
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12357v1",
          "abstract_url": "https://arxiv.org/abs/2605.12357v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12357v1",
          "published_at": "2026-05-12T16:31:44+00:00",
          "updated_at": "2026-05-12T16:31:44+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "Large Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.12357",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12357v1"
          },
          "relevance_score": 163,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"agent\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12357"
        },
        {
          "title": "Towards Automated Air Traffic Safety Assessment Around Non-Towered Airports Using Large Language Models",
          "summary": "We investigate frameworks for post-flight safety analysis at non-towered airports using large language models (LLMs). Non-towered airports rely on the Common Traffic Advisory Frequency (CTAF) for air traffic coordination and experience frequent near mid-air collisions due to the pilot self-announcement communication protocol. We propose a general vision-language model (VLM) approach to analyze the transcribed CTAF radio communications in natural language, METeorological Aerodrome Report (METAR) weather data, Automatic Dependent Surveillance-Broadcast (ADS-B) flight trajectories, and Visual Flight Rules sectional charts of the airfield. We provide a preliminary study at Half Moon Bay Airport, with a qualitative real world case study and a quantitative evaluation using a new synthetic dataset of communications and weather modalities. We qualitatively evaluate our framework on real flight data using Gemini 2.5 Pro, demonstrating accurate identification of a right-of-way violation. The synthetic dataset is derived from real examples and includes a 12-category hazard taxonomy, and is used to benchmark three open-source (Qwen 2.5-7B, Mistral-7B, Gemma-2-9B) and three closed-source (GPT-4o, GPT-5.4, Claude Sonnet 4.6) LLM models on the subset of inputs related to CTAF and METAR. Even limited to CTAF and METAR inputs and open source LLMs, instances of our framework typically achieve a macro F1 score above 0.85 on a binary nominal/danger classification task. Future work includes a quantitative evaluation across all modalities and a larger number of real world examples. Taken together, our results suggest that VLM analysis of safety at non-towered airports may be a valuable future capability.",
          "authors": [
            "Torsten Darrell",
            "Mahyar Ghazanfari",
            "Jordan Kam",
            "Alexandre Bayen",
            "Amin Tabrizian",
            "Peng Wei"
          ],
          "categories": [
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12332v1",
          "abstract_url": "https://arxiv.org/abs/2605.12332v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12332v1",
          "published_at": "2026-05-12T16:15:15+00:00",
          "updated_at": "2026-05-12T16:15:15+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12332",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12332v1"
          },
          "relevance_score": 163,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12332"
        },
        {
          "title": "MEME: Multi-entity & Evolving Memory Evaluation",
          "summary": "LLM-based agents increasingly operate in persistent environments where they must store, update, and reason over information across many sessions. While prior benchmarks evaluate only single-entity updates, MEME defines six tasks spanning the full space defined by the multi-entity and evolving axes, including three not scored by prior work: Cascade and Absence (dependency reasoning) and Deletion (post-removal state). Evaluating six memory systems spanning three memory paradigms on 100 controlled episodes, we find that all systems collapse on dependency reasoning under the default configuration (Cascade: 3%, Absence: 1% in average accuracy) despite adequate static retrieval performance. Prompt optimization, deeper retrieval, reduced filler noise, and most stronger LLMs fail to close this gap. Only a file-based agent paired with Claude Opus 4.7 as its internal LLM partially closes the gap, but at ~70x the baseline cost, indicating closure currently depends on configurations that are not practical at scale. Code and data are available on the project page: https://seokwonjung-jay.github.io/meme-eval/.",
          "authors": [
            "Seokwon Jung",
            "Alexander Rubinstein",
            "Arnas Uselis",
            "Sangdoo Yun",
            "Seong Joon Oh"
          ],
          "categories": [
            "cs.LG",
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12477v1",
          "abstract_url": "https://arxiv.org/abs/2605.12477v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12477v1",
          "published_at": "2026-05-12T17:55:10+00:00",
          "updated_at": "2026-05-12T17:55:10+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "LLM",
            "Benchmark"
          ],
          "doi": null,
          "arxiv_id": "2605.12477",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12477v1"
          },
          "relevance_score": 161,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"LLM\"",
            "summary matched \"reasoning\"",
            "summary matched \"agent\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12477"
        },
        {
          "title": "TextSeal: A Localized LLM Watermark for Provenance & Distillation Protection",
          "summary": "We introduce TextSeal, a state-of-the-art watermark for large language models. Building on Gumbel-max sampling, TextSeal introduces dual-key generation to restore output diversity, along with entropy-weighted scoring and multi-region localization for improved detection. It supports serving optimizations such as speculative decoding and multi-token prediction, and does not add any inference overhead. TextSeal strictly dominates baselines like SynthID-text in detection strength and is robust to dilution, maintaining confident localized detection even in heavily mixed human/AI documents. The scheme is theoretically distortion-free, and evaluation across reasoning benchmarks confirms that it preserves downstream performance; while a multilingual human evaluation (6000 A/B comparisons, 5 languages) shows no perceptible quality difference. Beyond its use for provenance detection, TextSeal is also ``radioactive'': its watermark signal transfers through model distillation, enabling detection of unauthorized use.",
          "authors": [
            "Tom Sander",
            "Hongyan Chang",
            "Tomáš Souček",
            "Tuan Tran",
            "Valeriu Lacatusu",
            "Sylvestre-Alvise Rebuffi",
            "Alexandre Mourachko",
            "Surya Parimi",
            "Christophe Ropers",
            "Rashel Moritz",
            "Vanessa Stark",
            "Hady Elsahar",
            "Pierre Fernandez"
          ],
          "categories": [
            "cs.CR",
            "cs.CL",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12456v1",
          "abstract_url": "https://arxiv.org/abs/2605.12456v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12456v1",
          "published_at": "2026-05-12T17:44:41+00:00",
          "updated_at": "2026-05-12T17:44:41+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12456",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12456v1"
          },
          "relevance_score": 160,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12456"
        },
        {
          "title": "Formalize, Don't Optimize: The Heuristic Trap in LLM-Generated Combinatorial Solvers",
          "summary": "Large Language Models (LLMs) struggle to solve complex combinatorial problems through direct reasoning, so recent neuro-symbolic systems increasingly use them to synthesize executable solvers. A central design question is how the LLM should represent the solver, and whether it should also attempt to optimize search. We introduce CP-SynC-XL, a benchmark of 100 combinatorial problems (4,577 instances), and evaluate three solver-construction paradigms: native algorithmic search (Python), constraint modeling through a Python solver API (Python + OR-Tools), and declarative constraint modeling (MiniZinc + OR-Tools). We find a consistent representational divergence: Python + OR-Tools attains the highest correctness across LLMs, while MiniZinc + OR-Tools has lower absolute coverage despite using the same OR-Tools back-end. Native Python is the most likely to return a schema-valid solution that fails verification, whereas solver-backed paths preserve higher conditional fidelity. On the heuristic axis, prompting for search optimization yields only small median speed-ups (1.03-1.12x) and a strongly bimodal effect: many instances slow down, and correctness drops sharply on a long tail of problems. A paired code-level audit traces these regressions to a recurring heuristic trap. Under an efficiency-oriented prompt, the LLM may replace complete search with local approximations (Python), inject unverified bounds (Python + OR-Tools), or add redundant declarative machinery that overwhelms or over-constrains the model (MiniZinc + OR-Tools). These findings support a conservative design principle for LLM-generated combinatorial solvers: use the LLM primarily to formalize variables, constraints, and objectives for verified solvers, and separately check any LLM-authored search optimization before use.",
          "authors": [
            "Haoyu Wang",
            "Yuliang Song",
            "Tao Li",
            "Zhiwei Deng",
            "Yaqing Wang",
            "Deepak Ramachandran",
            "Eldan Cohen",
            "Dan Roth"
          ],
          "categories": [
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12421v1",
          "abstract_url": "https://arxiv.org/abs/2605.12421v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12421v1",
          "published_at": "2026-05-12T17:15:45+00:00",
          "updated_at": "2026-05-12T17:15:45+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12421",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12421v1"
          },
          "relevance_score": 160,
          "match_reasons": [
            "title matched \"LLM\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "summary matched \"reasoning\"",
            "summary matched \"RAG\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12421"
        },
        {
          "title": "Iterative Audit Convergence in LLM-Managed Multi-Agent Systems: A Case Study in Prompt Engineering Quality Assurance",
          "summary": "Prompt specifications for multi-agent large language model (LLM) systems carry data contracts and integration logic across many interdependent files but are rarely subjected to structured-inspection rigor. This paper reports a single-system empirical case study of iterative, agent-driven auditing applied to AEGIS (Autonomous Engineering Governance and Intelligence System), a production seven-lane orchestration pipeline whose prompt-specification surface comprises approximately 7150 lines: 6907 across seven lane PROMPT.md files and a 245-line shared Ticket Contract. Nine sequential audit rounds, executed by Claude sub-agents using a checklist-driven walkthrough adapted from Weinberg and Freedman, surfaced 51 prompt-specification consistency defects, distinct from the 51 STRIDE-categorized adversarial code findings reported in the companion preprint. Per-round counts were 15, 8, 12, 2, 8, 1, 4, 1, and 0. We report a seven-category post-hoc defect taxonomy with explicit coding rules, observed non-monotonic convergence consistent with cascading edits and audit-scope expansion, and an audit protocol distilled from the study, with the final locked checklist released as a reproducibility appendix. Single-file review missed defect classes that were surfaced only by later expanded-scope rounds in this system. The same LLM family authored and audited the specifications; replication with dissimilar models and human reviewers is required before generalization.",
          "authors": [
            "Elias Calboreanu"
          ],
          "categories": [
            "cs.SE",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12280v1",
          "abstract_url": "https://arxiv.org/abs/2605.12280v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12280v1",
          "published_at": "2026-05-12T15:39:04+00:00",
          "updated_at": "2026-05-12T15:39:04+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12280",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12280v1"
          },
          "relevance_score": 144,
          "match_reasons": [
            "title matched \"LLM\"",
            "title matched \"agent\"",
            "summary matched \"language model\"",
            "summary matched \"large language model\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12280"
        },
        {
          "title": "Instruction Lens Score: Your Instruction Contributes a Powerful Object Hallucination Detector for Multimodal Large Language Models",
          "summary": "Multimodal large language models (MLLMs) have achieved remarkable progress, yet the object hallucination remains a critical challenge for reliable deployment. In this paper, we present an in-depth analysis of instruction token embeddings and reveal that they implicitly encode visual information while effectively filtering erroneous information introduced by misleading visual embeddings. Building on this insight, we propose the Instruction Lens Score (InsLen), which combines a Calibrated Local Score with a Context Consistency Score that measures context consistency of the object tokens. The proposed approach serves as a plug-and-play object hallucination detector without relying on auxiliary models or additional training. Extensive experiments across multiple benchmarks and diverse MLLM architectures demonstrate that InsLen consistently outperforms existing hallucination detection methods, highlighting its effectiveness and robustness. The code is available at https://github.com/Fraserlairh/Instruction-Lens-Score.",
          "authors": [
            "Runhe Lai",
            "Xinhua Lu",
            "Yanqi Wu",
            "Jinlun Ye",
            "Weijiang Yu",
            "Ruixuan Wang"
          ],
          "categories": [
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12258v1",
          "abstract_url": "https://arxiv.org/abs/2605.12258v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12258v1",
          "published_at": "2026-05-12T15:27:40+00:00",
          "updated_at": "2026-05-12T15:27:40+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "LLM"
          ],
          "doi": null,
          "arxiv_id": "2605.12258",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12258v1"
          },
          "relevance_score": 144,
          "match_reasons": [
            "title matched \"language model\"",
            "title matched \"large language model\"",
            "summary matched \"LLM\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12258"
        }
      ]
    },
    {
      "name": "Agent Runtime Security",
      "key_points": [
        "《Metaphor Is Not All Attention Needs》〔应用 / 方法〕：Large language models are increasingly deployed in safety-critical applications, where their ability to resist harmful instructions is essential. Although post…",
        "《A microservices-based endpoint monitoring platform with predictive NLP models for real-time security and hate-speech risk alerting》〔方法〕：Organizations increasingly depend on endpoint devices and corporate communication channels, yet they still face critical risks such as sensitive data leakage,…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "Metaphor Is Not All Attention Needs",
          "summary": "Large language models are increasingly deployed in safety-critical applications, where their ability to resist harmful instructions is essential. Although post-training aims to make models robust against many jailbreak strategies, recent evidence shows that stylistic reformulations, such as poetic transformation, can still bypass safety mechanisms with alarming effectiveness. This raises a central question: why do literary jailbreaks succeed? In this work, we investigate whether their effectiveness depends on specific poetic devices, on a failure to recognize literary formatting, or on deeper changes in how models process stylistically irregular prompts. We address this problem through an interpretability analysis of attention patterns. We perform input-level ablation studies to assess the contribution of individual and combinations of poetic devices; construct an interpretable vector representation of attention maps; cluster these representations and train linear probes to predict safety outcomes and literary format. Our results show that models distinguish poetic from prose formats with high accuracy, yet struggle to predict jailbreak success within each format. Clustering further reveals clear separation by literary format, but not by safety label. These findings indicate that jailbreak success is not caused by a failure to recognize poetic formatting; rather, poetic prompts induce distinct processing patterns that remain largely independent of harmful-content detection. Overall, literary jailbreaks appear to misalign large language models not through any single poetic device, but through accumulated stylistic irregularities that alter prompt processing and avoid lexical triggers considered during post-training. This suggests that robustness requires safety mechanisms that account for style-induced shifts in model behavior. We use Qwen3-14B as a representative open-weight case study.",
          "authors": [
            "Olga Sorokoletova",
            "Francesco Giarrusso",
            "Giacomo De Luca",
            "Piercosma Bisconti",
            "Matteo Prandi",
            "Federico Pierucci",
            "Marcello Galisai",
            "Vincenzo Suriani",
            "Daniele Nardi"
          ],
          "categories": [
            "cs.CL",
            "cs.CY"
          ],
          "paper_id": "http://arxiv.org/abs/2605.12128v1",
          "abstract_url": "https://arxiv.org/abs/2605.12128v1",
          "pdf_url": "https://arxiv.org/pdf/2605.12128v1",
          "published_at": "2026-05-12T13:50:26+00:00",
          "updated_at": "2026-05-12T13:50:26+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "Large Language Model"
          ],
          "doi": null,
          "arxiv_id": "2605.12128",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.12128v1"
          },
          "relevance_score": 44,
          "match_reasons": [
            "summary matched \"jailbreak\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.12128"
        },
        {
          "title": "A microservices-based endpoint monitoring platform with predictive NLP models for real-time security and hate-speech risk alerting",
          "summary": "Organizations increasingly depend on endpoint devices and corporate communication channels, yet they still face critical risks such as sensitive data leakage, suspicious user behavior, and the circulation of hateful or harmful language in workplace contexts. Current solutions frequently address these issues in isolation (e.g., productivity tracking, data loss prevention, or hate-speech detection), limiting correlation across signals and delaying incident response. This work proposes a unified, microservices-based platform that collects endpoint telemetry and applies predictive natural language processing models to support real-time security and compliance alerting. The architecture is modular and scalable, relying on RabbitMQ for event ingestion and routing and Redis for low-latency data access and alert delivery. For text classification, transformer-based models such as BERT are evaluated for hate-speech risk detection, achieving an average accuracy of 87%. Experimental results indicate that the proposed platform can promptly surface indicators of data exfiltration and policy violations while centralizing alert management, providing an integrated framework that combines monitoring, security analytics, and predictive capabilities.",
          "authors": [
            "Darlan Noetzold",
            "Anubis Graciela De Moraes Rossetto",
            "Juan Francisco De Paz Santana",
            "Valderi Reis Quietinho Leithard"
          ],
          "categories": [
            "cs.CR"
          ],
          "paper_id": "http://arxiv.org/abs/2605.11997v1",
          "abstract_url": "https://arxiv.org/abs/2605.11997v1",
          "pdf_url": "https://arxiv.org/pdf/2605.11997v1",
          "published_at": "2026-05-12T11:46:46+00:00",
          "updated_at": "2026-05-12T11:46:46+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "方法"
          ],
          "topics": [
            "RAG",
            "DATA Exfiltration"
          ],
          "doi": null,
          "arxiv_id": "2605.11997",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2605.11997v1"
          },
          "relevance_score": 42,
          "match_reasons": [
            "summary matched \"data exfiltration\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2605.11997"
        }
      ]
    }
  ]
}