{
  "generated_at": "2026-04-16T11:43:00.212360+08:00",
  "timezone": "Asia/Shanghai",
  "lookback_hours": 24,
  "sorting": {
    "default_sort_by": "hybrid",
    "summary": "hybrid (relevance first, published_at tie-break)",
    "weights": {
      "title_match_weight": 40,
      "summary_match_weight": 18,
      "doi_weight": 12,
      "pdf_weight": 8,
      "rich_summary_weight": 6,
      "metadata_weight": 4,
      "multi_source_weight": 10,
      "freshness_weight_cap": 24
    },
    "feeds": [
      {
        "name": "LLM",
        "sort_by": "hybrid"
      },
      {
        "name": "Vision",
        "sort_by": "hybrid"
      },
      {
        "name": "PubMed AI",
        "sort_by": "hybrid"
      },
      {
        "name": "OpenAlex AI",
        "sort_by": "hybrid"
      }
    ]
  },
  "highlights": [
    "主题「Benchmark」：命中 18 篇，覆盖 LLM、Vision 等，代表论文包括 《GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis》、《HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark》。",
    "主题「Reasoning」：命中 17 篇，覆盖 LLM、Vision 等，代表论文包括 《GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis》、《Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning》。",
    "主题「Language Model」：命中 7 篇，覆盖 LLM、Vision 等，代表论文包括 《Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning》、《TREX: Automating LLM Fine-tuning via Agent-Driven Tree-based Exploration》。"
  ],
  "focus_items": [],
  "action_items": [],
  "topic_sections": [
    {
      "name": "Benchmark",
      "paper_count": 18,
      "feed_names": [
        "LLM",
        "Vision",
        "PubMed AI"
      ],
      "paper_titles": [
        "GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis",
        "HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark",
        "LongCoT: Benchmarking Long-Horizon Chain-of-Thought Reasoning",
        "Correct Prediction, Wrong Steps? Consensus Reasoning Knowledge Graph for Robust Chain-of-Thought Synthesis",
        "TREX: Automating LLM Fine-tuning via Agent-Driven Tree-based Exploration",
        "MAny: Merge Anything for Multimodal Continual Instruction Tuning",
        "MedRCube: A Multidimensional Framework for Fine-Grained and In-Depth Evaluation of MLLMs in Medical Imaging",
        "Doc-V*:Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA",
        "Memory Transfer Learning: How Memories are Transferred Across Domains in Coding Agents",
        "Reward Design for Physical Reasoning in Vision-Language Models",
        "Who Gets Flagged? The Pluralistic Evaluation Gap in AI Content Watermarking",
        "ROSE: Retrieval-Oriented Segmentation Enhancement",
        "Decoding the Delta: Unifying Remote Sensing Change Detection and Understanding with Multimodal Large Language Models",
        "POINTS-Seeker: Towards Training a Multimodal Agentic Search Model from Scratch",
        "PBE-UNet: A light weight Progressive Boundary-Enhanced U-Net with Scale-Aware Aggregation for Ultrasound Image Segmentation",
        "Augmenting Large Language Model With Prompt Engineering and Supervised Fine-Tuning in Non-Small Cell Lung Cancer Tumor-Node-Metastasis Staging: Framework Development and Validation.",
        "PKFAR: psychiatry knowledge-fused augmented reasoning with large language models.",
        "A Multi-AI Agent Framework for Interactive Neurosurgical Education and Evaluation: From Vignettes to Virtual Conversations."
      ],
      "key_points": [
        "《GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis》〔评测 / 应用 / 方法〕：The integration of Large Language Models (LLMs) into Geographic Information Systems (GIS) marks a paradigm shift toward autonomous spatial analysis. However, e…",
        "《HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark》〔评测 / 数据 / 方法〕：Existing agent-safety evaluation has focused mainly on externally induced risks. Yet agents may still enter unsafe trajectories under benign conditions. We stu…"
      ]
    },
    {
      "name": "Reasoning",
      "paper_count": 17,
      "feed_names": [
        "LLM",
        "Vision",
        "PubMed AI"
      ],
      "paper_titles": [
        "GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis",
        "Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning",
        "LongCoT: Benchmarking Long-Horizon Chain-of-Thought Reasoning",
        "Correct Prediction, Wrong Steps? Consensus Reasoning Knowledge Graph for Robust Chain-of-Thought Synthesis",
        "The cognitive companion: a lightweight parallel monitoring architecture for detecting and recovering from reasoning degradation in LLM agents",
        "MUSE: Multi-Domain Chinese User Simulation via Self-Evolving Profiles and Rubric-Guided Alignment",
        "MAny: Merge Anything for Multimodal Continual Instruction Tuning",
        "MedRCube: A Multidimensional Framework for Fine-Grained and In-Depth Evaluation of MLLMs in Medical Imaging",
        "Doc-V*:Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA",
        "Reward Design for Physical Reasoning in Vision-Language Models",
        "ToolOmni: Enabling Open-World Tool Use via Agentic learning with Proactive Retrieval and Grounded Execution",
        "Decoding the Delta: Unifying Remote Sensing Change Detection and Understanding with Multimodal Large Language Models",
        "Free Lunch for Unified Multimodal Models: Enhancing Generation via Reflective Rectification with Inherent Understanding",
        "POINTS-Seeker: Towards Training a Multimodal Agentic Search Model from Scratch",
        "Augmenting Large Language Model With Prompt Engineering and Supervised Fine-Tuning in Non-Small Cell Lung Cancer Tumor-Node-Metastasis Staging: Framework Development and Validation.",
        "PKFAR: psychiatry knowledge-fused augmented reasoning with large language models.",
        "A Multi-AI Agent Framework for Interactive Neurosurgical Education and Evaluation: From Vignettes to Virtual Conversations."
      ],
      "key_points": [
        "《GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis》〔评测 / 应用 / 方法〕：The integration of Large Language Models (LLMs) into Geographic Information Systems (GIS) marks a paradigm shift toward autonomous spatial analysis. However, e…",
        "《Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning》〔评测 / 数据 / 方法〕：The rapid evolution of multimodal large models has revolutionized the simulation of diverse characters in speech dialogue systems, enabling a novel interactive…"
      ]
    },
    {
      "name": "Language Model",
      "paper_count": 7,
      "feed_names": [
        "LLM",
        "Vision",
        "PubMed AI"
      ],
      "paper_titles": [
        "Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning",
        "TREX: Automating LLM Fine-tuning via Agent-Driven Tree-based Exploration",
        "The cognitive companion: a lightweight parallel monitoring architecture for detecting and recovering from reasoning degradation in LLM agents",
        "ToolOmni: Enabling Open-World Tool Use via Agentic learning with Proactive Retrieval and Grounded Execution",
        "ROSE: Retrieval-Oriented Segmentation Enhancement",
        "Fact-Checking Large Language Model Responses to a Health Care Prompt: Comparative Study.",
        "Fine-Tuned Large Language Models for Automated Radiology Impression Generation: A Multicenter Evaluation."
      ],
      "key_points": [
        "《Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning》〔评测 / 数据 / 方法〕：The rapid evolution of multimodal large models has revolutionized the simulation of diverse characters in speech dialogue systems, enabling a novel interactive…",
        "《TREX: Automating LLM Fine-tuning via Agent-Driven Tree-based Exploration》〔评测 / 应用 / 方法〕：While Large Language Models (LLMs) have empowered AI research agents to perform isolated scientific tasks, automating complex, real-world workflows, such as LL…"
      ]
    },
    {
      "name": "Evaluation",
      "paper_count": 6,
      "feed_names": [
        "LLM",
        "Vision",
        "PubMed AI"
      ],
      "paper_titles": [
        "HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark",
        "MUSE: Multi-Domain Chinese User Simulation via Self-Evolving Profiles and Rubric-Guided Alignment",
        "Who Gets Flagged? The Pluralistic Evaluation Gap in AI Content Watermarking",
        "Seedance 2.0: Advancing Video Generation for World Complexity",
        "Fact-Checking Large Language Model Responses to a Health Care Prompt: Comparative Study.",
        "Fine-Tuned Large Language Models for Automated Radiology Impression Generation: A Multicenter Evaluation."
      ],
      "key_points": [
        "《HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark》〔评测 / 数据 / 方法〕：Existing agent-safety evaluation has focused mainly on externally induced risks. Yet agents may still enter unsafe trajectories under benign conditions. We stu…",
        "《MUSE: Multi-Domain Chinese User Simulation via Self-Evolving Profiles and Rubric-Guided Alignment》〔评测 / 方法〕：User simulators are essential for the scalable training and evaluation of interactive AI systems. However, existing approaches often rely on shallow user profi…"
      ]
    },
    {
      "name": "Diffusion",
      "paper_count": 3,
      "feed_names": [
        "Vision"
      ],
      "paper_titles": [
        "DiT as Real-Time Rerenderer: Streaming Video Stylization with Autoregressive Diffusion Transformer",
        "Remote Sensing Image Super-Resolution for Imbalanced Textures: A Texture-Aware Diffusion Framework",
        "Blind Bitstream-corrupted Video Recovery via Metadata-guided Diffusion Model"
      ],
      "key_points": [
        "《DiT as Real-Time Rerenderer: Streaming Video Stylization with Autoregressive Diffusion Transformer》〔评测 / 数据 / 应用 / 方法〕：Recent advances in video generation models has significantly accelerated video generation and related downstream tasks. Among these, video stylization holds im…",
        "《Remote Sensing Image Super-Resolution for Imbalanced Textures: A Texture-Aware Diffusion Framework》〔评测 / 应用 / 方法〕：Generative diffusion priors have recently achieved state-of-the-art performance in natural image super-resolution, demonstrating a powerful capability to synth…"
      ]
    }
  ],
  "template": "zh_daily_brief",
  "feeds": [
    {
      "name": "LLM",
      "key_points": [
        "《GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis》〔评测 / 应用 / 方法〕：The integration of Large Language Models (LLMs) into Geographic Information Systems (GIS) marks a paradigm shift toward autonomous spatial analysis. However, e…",
        "《HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark》〔评测 / 数据 / 方法〕：Existing agent-safety evaluation has focused mainly on externally induced risks. Yet agents may still enter unsafe trajectories under benign conditions. We stu…",
        "《Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning》〔评测 / 数据 / 方法〕：The rapid evolution of multimodal large models has revolutionized the simulation of diverse characters in speech dialogue systems, enabling a novel interactive…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis",
          "summary": "The integration of Large Language Models (LLMs) into Geographic Information Systems (GIS) marks a paradigm shift toward autonomous spatial analysis. However, evaluating these LLM-based agents remains challenging due to the complex, multi-step nature of geospatial workflows. Existing benchmarks primarily rely on static text or code matching, neglecting dynamic runtime feedback and the multimodal nature of spatial outputs. To address this gap, we introduce GeoAgentBench (GABench), a dynamic and interactive evaluation benchmark tailored for tool-augmented GIS agents. GABench provides a realistic execution sandbox integrating 117 atomic GIS tools, encompassing 53 typical spatial analysis tasks across 6 core GIS domains. Recognizing that precise parameter configuration is the primary determinant of execution success in dynamic GIS environments, we designed the Parameter Execution Accuracy (PEA) metric, which utilizes a \"Last-Attempt Alignment\" strategy to quantify the fidelity of implicit parameter inference. Complementing this, a Vision-Language Model (VLM) based verification is proposed to assess data-spatial accuracy and cartographic style adherence. Furthermore, to address the frequent task failures caused by parameter misalignments and runtime anomalies, we developed a novel agent architecture, Plan-and-React, that mimics expert cognitive workflows by decoupling global orchestration from step-wise reactive execution. Extensive experiments with seven representative LLMs demonstrate that the Plan-and-React paradigm significantly outperforms traditional frameworks, achieving the optimal balance between logical rigor and execution robustness, particularly in multi-step reasoning and error recovery. Our findings highlight current capability boundaries and establish a robust standard for assessing and advancing the next generation of autonomous GeoAI.",
          "authors": [
            "Bo Yu",
            "Cheng Yang",
            "Dongyang Hou",
            "Chengfu Liu",
            "Jiayao Liu",
            "Chi Wang",
            "Zhiming Zhang",
            "Haifeng Li",
            "Wentao Yang"
          ],
          "categories": [
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13888v1",
          "abstract_url": "https://arxiv.org/abs/2604.13888v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13888v1",
          "published_at": "2026-04-15T13:55:34+00:00",
          "updated_at": "2026-04-15T13:55:34+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.13888",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13888v1"
          },
          "relevance_score": 162,
          "match_reasons": [
            "title matched \"agent\"",
            "title matched \"benchmark\"",
            "summary matched \"reasoning\"",
            "summary matched \"alignment\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13888"
        },
        {
          "title": "HINTBench: Horizon-agent Intrinsic Non-attack Trajectory Benchmark",
          "summary": "Existing agent-safety evaluation has focused mainly on externally induced risks. Yet agents may still enter unsafe trajectories under benign conditions. We study this complementary but underexplored setting through the lens of \\emph{intrinsic} risk, where intrinsic failures remain latent, propagate across long-horizon execution, and eventually lead to high-consequence outcomes. To evaluate this setting, we introduce \\emph{non-attack intrinsic risk auditing} and present \\textbf{HINTBench}, a benchmark of 629 agent trajectories (523 risky, 106 safe; 33 steps on average) supporting three tasks: risk detection, risk-step localization, and intrinsic failure-type identification. Its annotations are organized under a unified five-constraint taxonomy. Experiments reveal a substantial capability gap: strong LLMs perform well on trajectory-level risk detection, but their performance drops to below 35 Strict-F1 on risk-step localization, while fine-grained failure diagnosis proves even harder. Existing guard models transfer poorly to this setting. These findings establish intrinsic risk auditing as an open challenge for agent safety.",
          "authors": [
            "Jiacheng Wang",
            "Jinchang Hou",
            "Fabian Wang",
            "Ping Jian",
            "Chenfu Bao",
            "Zhonghou Lv"
          ],
          "categories": [
            "cs.LG",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13954v1",
          "abstract_url": "https://arxiv.org/abs/2604.13954v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13954v1",
          "published_at": "2026-04-15T15:06:01+00:00",
          "updated_at": "2026-04-15T15:06:01+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Evaluation"
          ],
          "doi": null,
          "arxiv_id": "2604.13954",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13954v1"
          },
          "relevance_score": 127,
          "match_reasons": [
            "title matched \"agent\"",
            "title matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13954"
        },
        {
          "title": "Character Beyond Speech: Leveraging Role-Playing Evaluation in Audio Large Language Models via Reinforcement Learning",
          "summary": "The rapid evolution of multimodal large models has revolutionized the simulation of diverse characters in speech dialogue systems, enabling a novel interactive paradigm. Character attributes are manifested not only in textual responses but also through vocal features, as speech conveys rich paralinguistic information that is challenging to quantify. This poses significant difficulties in evaluating the character alignment of role-playing agents. To address these challenges, we present RoleJudge, an evaluation framework that leverages audio large language models to systematically assess the alignment between speech and character across multiple modalities and dimensions. Furthermore, we introduce RoleChat, the first voice role-playing evaluation dataset enriched with chain-of-thought reasoning annotations, comprising a diverse set of authentic and LLM-generated speech samples. Utilizing this dataset, we implement a multi-stage training paradigm and incorporate Standard Alignment in reinforcement learning to mitigate reward misalignment during optimization. Experimental results in terms of accuracy and subjective assessment demonstrate that RoleJudge outperforms various baseline models, validating the effectiveness of our multidimensional evaluation framework.",
          "authors": [
            "Dongjie Fu",
            "Fangming Feng",
            "Xize Cheng",
            "Linjun Li",
            "Zhou Zhao",
            "Tao Jin"
          ],
          "categories": [
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13804v1",
          "abstract_url": "https://arxiv.org/abs/2604.13804v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13804v1",
          "published_at": "2026-04-15T12:39:03+00:00",
          "updated_at": "2026-04-15T12:39:03+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Reasoning",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2604.13804",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13804v1"
          },
          "relevance_score": 120,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"agent\"",
            "summary matched \"reasoning\"",
            "summary matched \"alignment\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13804"
        },
        {
          "title": "LongCoT: Benchmarking Long-Horizon Chain-of-Thought Reasoning",
          "summary": "As language models are increasingly deployed for complex autonomous tasks, their ability to reason accurately over longer horizons becomes critical. An essential component of this ability is planning and managing a long, complex chain-of-thought (CoT). We introduce LongCoT, a scalable benchmark of 2,500 expert-designed problems spanning chemistry, mathematics, computer science, chess, and logic to isolate and directly measure the long-horizon CoT reasoning capabilities of frontier models. Problems consist of a short input with a verifiable answer; solving them requires navigating a graph of interdependent steps that span tens to hundreds of thousands of reasoning tokens. Each local step is individually tractable for frontier models, so failures reflect long-horizon reasoning limitations. At release, the best models achieve <10% accuracy (GPT 5.2: 9.8%; Gemini 3 Pro: 6.1%) on LongCoT, revealing a substantial gap in current capabilities. Overall, LongCoT provides a rigorous measure of long-horizon reasoning, tracking the ability of frontier models to reason reliably over extended periods.",
          "authors": [
            "Sumeet Ramesh Motwani",
            "Daniel Nichols",
            "Charles London",
            "Peggy Li",
            "Fabio Pizzati",
            "Acer Blake",
            "Hasan Hammoud",
            "Tavish McDonald",
            "Akshat Naik",
            "Alesia Ivanova",
            "Vignesh Baskaran",
            "Ivan Laptev",
            "Ruben Glatt",
            "Tal Ben-Nun",
            "Philip Torr",
            "Natasha Jaques",
            "Ameya Prabhu",
            "Brian Bartoldson",
            "Bhavya Kailkhura",
            "Christian Schroeder de Witt"
          ],
          "categories": [
            "cs.LG",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14140v1",
          "abstract_url": "https://arxiv.org/abs/2604.14140v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14140v1",
          "published_at": "2026-04-15T17:58:05+00:00",
          "updated_at": "2026-04-15T17:58:05+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.14140",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14140v1"
          },
          "relevance_score": 112,
          "match_reasons": [
            "title matched \"reasoning\"",
            "title matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14140"
        },
        {
          "title": "Correct Prediction, Wrong Steps? Consensus Reasoning Knowledge Graph for Robust Chain-of-Thought Synthesis",
          "summary": "LLM reasoning traces suffer from complex flaws -- *Step Internal Flaws* (logical errors, hallucinations, etc.) and *Step-wise Flaws* (overthinking, underthinking), which vary by sample. A natural approach would be to provide ground-truth labels to guide LLMs' reasoning. Contrary to intuition, we show that this yields no improvement in reasoning ability. We then propose CRAFT, a unified framework that mitigates both types of Step flaws, which builds a Reasoning Knowledge Graph (RKG) based on the consensus parts of multiple candidate traces, and synthesizes a high-quality trace through topological generation. Our approach improves label-prediction accuracy by 10+% on average, and consistently outperforms all baselines across both logical and mathematical reasoning benchmarks. Further, detailed benchmark evaluation proves that our method also improves the quality of LLMs' reasoning traces in multiple dimensions.",
          "authors": [
            "Zipeng Ling",
            "Shuliang Liu",
            "Shenghong Fu",
            "Yuehao Tang",
            "Seonil Son",
            "Yao Wan",
            "Xuming Hu"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14121v1",
          "abstract_url": "https://arxiv.org/abs/2604.14121v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14121v1",
          "published_at": "2026-04-15T17:43:10+00:00",
          "updated_at": "2026-04-15T17:43:10+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.14121",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14121v1"
          },
          "relevance_score": 108,
          "match_reasons": [
            "title matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14121"
        },
        {
          "title": "TREX: Automating LLM Fine-tuning via Agent-Driven Tree-based Exploration",
          "summary": "While Large Language Models (LLMs) have empowered AI research agents to perform isolated scientific tasks, automating complex, real-world workflows, such as LLM training, remains a significant challenge. In this paper, we introduce TREX, a multi-agent system that automates the entire LLM training life-cycle. By orchestrating collaboration between two core modules-the Researcher and the Executor-the system seamlessly performs requirement analysis, open-domain literature and data research, formulation of training strategies, preparation of data recipes, and model training and evaluation. The multi-round experimental process is modeled as a search tree, enabling the system to efficiently plan exploration paths, reuse historical results, and distill high-level insights from iterative trials. To evaluate the capability of automated LLM training, we construct FT-Bench, a benchmark comprising 10 tasks derived from real-world scenarios, ranging from optimizing fundamental model capabilities to enhancing performance on domain-specific tasks. Experimental results demonstrate that the TREX agent consistently optimizes model performance on target tasks.",
          "authors": [
            "Zerun Ma",
            "Guoqiang Wang",
            "Xinchen Xie",
            "Yicheng Chen",
            "He Du",
            "Bowen Li",
            "Yanan Sun",
            "Wenran Liu",
            "Kai Chen",
            "Yining Li"
          ],
          "categories": [
            "cs.AI",
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14116v1",
          "abstract_url": "https://arxiv.org/abs/2604.14116v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14116v1",
          "published_at": "2026-04-15T17:38:06+00:00",
          "updated_at": "2026-04-15T17:38:06+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2604.14116",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14116v1"
          },
          "relevance_score": 107,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14116"
        },
        {
          "title": "The cognitive companion: a lightweight parallel monitoring architecture for detecting and recovering from reasoning degradation in LLM agents",
          "summary": "Large language model (LLM) agents on multi-step tasks suffer reasoning degradation, looping, drift, stuck states, at rates up to 30% on hard tasks. Current solutions include hard step limits (abrupt) or LLM-as-judge monitoring (10-15% overhead per step). This paper introduces the Cognitive Companion, a parallel monitoring architecture with two implementations: an LLM-based Companion and a novel zero-overhead Probe-based Companion. We report a three-batch feasibility study centered on Gemma 4 E4B, with an additional exploratory small-model analysis on Qwen 2.5 1.5B and Llama 3.2 1B. In our experiments, the LLM-based Companion reduced repetition on loop-prone tasks by 52-62% with approximately 11% overhead. The Probe-based Companion, trained on hidden states from layer 28, showed a mean effect size of +0.471 at zero measured inference overhead; its strongest probe result achieved cross-validated AUROC 0.840 on a small proxy-labeled dataset. A key empirical finding is that companion benefit appears task-type dependent: companions are most helpful on loop-prone and open-ended tasks, while effects are neutral or negative on more structured tasks. Our small-model experiments also suggest a possible scale boundary: companions did not improve the measured quality proxy on 1B-1.5B models, even when interventions fired. Overall, the paper should be read as a feasibility study rather than a definitive validation. The results provide encouraging evidence that sub-token monitoring may be useful, identify task-type sensitivity as a practical design constraint, and motivate selective companion activation as a promising direction for future work.",
          "authors": [
            "Rafflesia Khan",
            "Nafiul Islam Khan"
          ],
          "categories": [
            "cs.AI",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13759v1",
          "abstract_url": "https://arxiv.org/abs/2604.13759v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13759v1",
          "published_at": "2026-04-15T11:44:20+00:00",
          "updated_at": "2026-04-15T11:44:20+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "数据",
            "方法"
          ],
          "topics": [
            "Reasoning",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2604.13759",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13759v1"
          },
          "relevance_score": 106,
          "match_reasons": [
            "title matched \"agent\"",
            "title matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13759"
        },
        {
          "title": "MUSE: Multi-Domain Chinese User Simulation via Self-Evolving Profiles and Rubric-Guided Alignment",
          "summary": "User simulators are essential for the scalable training and evaluation of interactive AI systems. However, existing approaches often rely on shallow user profiling, struggle to maintain persona consistency over long interactions, and are largely limited to English or single-domain settings. We present MUSE, a multi-domain Chinese user simulation framework designed to generate human-like, controllable, and behaviorally consistent responses. First, we propose Iterative Profile Self-Evolution (IPSE), which gradually optimizes user profiles by comparing and reasoning discrepancies between simulated trajectories and real dialogue behaviors. We then apply Role-Reversal Supervised Fine-Tuning to improve local response realism and human-like expression. To enable fine-grained behavioral alignment, we further train a specialized rubric-based reward model and incorporate it into rubric-guided multi-turn reinforcement learning, which optimizes the simulator at the dialogue level and enhances long-horizon behavioral consistency. Experiments show that MUSE consistently outperforms strong baselines in both utterance-level and session-level evaluations, generating responses that are more realistic, coherent, and persona-consistent over extended interactions.",
          "authors": [
            "Zihao Liu",
            "Hantao Zhou",
            "Jiguo Li",
            "Jun Xu",
            "Jiuchong Gao",
            "Jinghua Hao",
            "Renqing He",
            "Peng Wang"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13828v1",
          "abstract_url": "https://arxiv.org/abs/2604.13828v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13828v1",
          "published_at": "2026-04-15T13:01:00+00:00",
          "updated_at": "2026-04-15T13:01:00+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Reasoning",
            "Evaluation"
          ],
          "doi": null,
          "arxiv_id": "2604.13828",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13828v1"
          },
          "relevance_score": 103,
          "match_reasons": [
            "title matched \"alignment\"",
            "summary matched \"reasoning\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13828"
        },
        {
          "title": "MAny: Merge Anything for Multimodal Continual Instruction Tuning",
          "summary": "Multimodal Continual Instruction Tuning (MCIT) is essential for sequential task adaptation of Multimodal Large Language Models (MLLMs) but is severely restricted by catastrophic forgetting. While existing literature focuses on the reasoning language backbone, in this work, we expose a critical yet neglected dual-forgetting phenomenon across both perception drift in Cross-modal Projection Space and reasoning collapse in Low-rank Parameter Space. To resolve this, we present \\textbf{MAny} (\\textbf{M}erge \\textbf{Any}thing), a framework that merges task-specific knowledge through \\textbf{C}ross-modal \\textbf{P}rojection \\textbf{M}erging (\\textbf{CPM}) and \\textbf{L}ow-rank \\textbf{P}arameter \\textbf{M}erging (\\textbf{LPM}). Specifically, CPM recovers perceptual alignment by adaptively merging cross-modal visual representations via visual-prototype guidance, ensuring accurate feature recovery during inference. Simultaneously, LPM eliminates mutual interference among task-specific low-rank modules by recursively merging low-rank weight matrices. By leveraging recursive least squares, LPM provides a closed-form solution that mathematically guarantees an optimal fusion trajectory for reasoning stability. Notably, MAny operates as a training-free paradigm that achieves knowledge merging via efficient CPU-based algebraic operations, eliminating additional gradient-based optimization beyond initial tuning. Our extensive evaluations confirm the superior performance and robustness of MAny across multiple MLLMs and benchmarks. Specifically, on the UCIT benchmark, MAny achieves significant leads of up to 8.57\\% and 2.85\\% in final average accuracy over state-of-the-art methods across two different MLLMs, respectively.",
          "authors": [
            "Zijian Gao",
            "Wangwang Jia",
            "Xingxing Zhang",
            "Pengfei Qian",
            "Tao Sun",
            "Bo Ding",
            "Yong Dou",
            "Huaimin Wang",
            "Kele Xu"
          ],
          "categories": [
            "cs.LG",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14016v1",
          "abstract_url": "https://arxiv.org/abs/2604.14016v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14016v1",
          "published_at": "2026-04-15T15:57:23+00:00",
          "updated_at": "2026-04-15T15:57:23+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.14016",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14016v1"
          },
          "relevance_score": 102,
          "match_reasons": [
            "summary matched \"reasoning\"",
            "summary matched \"alignment\"",
            "summary matched \"benchmark\"",
            "summary matched \"evaluation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14016"
        },
        {
          "title": "MedRCube: A Multidimensional Framework for Fine-Grained and In-Depth Evaluation of MLLMs in Medical Imaging",
          "summary": "The potential of Multimodal Large Language Models (MLLMs) in domain of medical imaging raise the demands of systematic and rigorous evaluation frameworks that are aligned with the real-world medical imaging practice. Existing practices that report single or coarse-grained metrics are lack the granularity required for specialized clinical support and fail to assess the reliability of reasoning mechanisms. To address this, we propose a paradigm shift toward multidimensional, fine-grained and in-depth evaluation. Based on a two-stage systematic construction pipeline designed for this paradigm, we instantiate it with MedRCube. We benchmark 33 MLLMs, \\textit{Lingshu-32B} achieve top-tier performance. Crucially, MedRCube exposes a series of pronounced insights inaccessible under prior evaluation settings. Furthermore, we introduce a credibility evaluation subset to quantify reasoning credibility, uncover a highly significant positive association between shortcut behavior and diagnostic task performance, raising concerns for clinically trustworthy deployment. The resources of this work can be found at https://github.com/F1mc/MedRCube.",
          "authors": [
            "Zhijie Bao",
            "Fangke Chen",
            "Licheng Bao",
            "Chenhui Zhang",
            "Wei Chen",
            "Jiajie Peng",
            "Zhongyu Wei"
          ],
          "categories": [
            "cs.CL",
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13756v1",
          "abstract_url": "https://arxiv.org/abs/2604.13756v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13756v1",
          "published_at": "2026-04-15T11:41:20+00:00",
          "updated_at": "2026-04-15T11:41:20+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.13756",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13756v1"
          },
          "relevance_score": 101,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13756"
        },
        {
          "title": "Doc-V*:Coarse-to-Fine Interactive Visual Reasoning for Multi-Page Document VQA",
          "summary": "Multi-page Document Visual Question Answering requires reasoning over semantics, layouts, and visual elements in long, visually dense documents. Existing OCR-free methods face a trade-off between capacity and precision: end-to-end models scale poorly with document length, while visual retrieval-based pipelines are brittle and passive. We propose Doc-$V^*$, an \\textbf{OCR-free agentic} framework that casts multi-page DocVQA as sequential evidence aggregation. Doc-$V^*$ begins with a thumbnail overview, then actively navigates via semantic retrieval and targeted page fetching, and aggregates evidence in a structured working memory for grounded reasoning. Trained by imitation learning from expert trajectories and further optimized with Group Relative Policy Optimization, Doc-$V^*$ balances answer accuracy with evidence-seeking efficiency. Across five benchmarks, Doc-$V^*$ outperforms open-source baselines and approaches proprietary models, improving out-of-domain performance by up to \\textbf{47.9\\%} over RAG baseline. Other results reveal effective evidence aggregation with selective attention, not increased input pages.",
          "authors": [
            "Yuanlei Zheng",
            "Pei Fu",
            "Hang Li",
            "Ziyang Wang",
            "Yuyi Zhang",
            "Wenyu Ruan",
            "Xiaojin Zhang",
            "Zhongyu Wei",
            "Zhenbo Luo",
            "Jian Luan",
            "Wei Chen",
            "Xiang Bai"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13731v1",
          "abstract_url": "https://arxiv.org/abs/2604.13731v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13731v1",
          "published_at": "2026-04-15T11:12:27+00:00",
          "updated_at": "2026-04-15T11:12:27+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.13731",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13731v1"
          },
          "relevance_score": 101,
          "match_reasons": [
            "title matched \"reasoning\"",
            "summary matched \"agent\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13731"
        },
        {
          "title": "Memory Transfer Learning: How Memories are Transferred Across Domains in Coding Agents",
          "summary": "Memory-based self-evolution has emerged as a promising paradigm for coding agents. However, existing approaches typically restrict memory utilization to homogeneous task domains, failing to leverage the shared infrastructural foundations, such as runtime environments and programming languages, that exist across diverse real-world coding problems. To address this limitation, we investigate \\textbf{Memory Transfer Learning} (MTL) by harnessing a unified memory pool from heterogeneous domains. We evaluate performance across 6 coding benchmarks using four memory representations, ranging from concrete traces to abstract insights. Our experiments demonstrate that cross-domain memory improves average performance by 3.7\\%, primarily by transferring meta-knowledge, such as validation routines, rather than task-specific code. Importantly, we find that abstraction dictates transferability; high-level insights generalize well, whereas low-level traces often induce negative transfer due to excessive specificity. Furthermore, we show that transfer effectiveness scales with the size of the memory pool, and memory can be transferred even between different models. Our work establishes empirical design principles for expanding memory utilization beyond single-domain silos. Project page: https://memorytransfer.github.io/",
          "authors": [
            "Kangsan Kim",
            "Minki Kang",
            "Taeil Kim",
            "Yanlai Yang",
            "Mengye Ren",
            "Sung Ju Hwang"
          ],
          "categories": [
            "cs.AI",
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14004v1",
          "abstract_url": "https://arxiv.org/abs/2604.14004v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14004v1",
          "published_at": "2026-04-15T15:50:29+00:00",
          "updated_at": "2026-04-15T15:50:29+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Agent"
          ],
          "doi": null,
          "arxiv_id": "2604.14004",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14004v1"
          },
          "relevance_score": 88,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14004"
        },
        {
          "title": "Reward Design for Physical Reasoning in Vision-Language Models",
          "summary": "Physical reasoning over visual inputs demands tight integration of visual perception, domain knowledge, and multi-step symbolic inference. Yet even state-of-the-art Vision Language Models (VLMs) fall far short of human performance on physics benchmarks. While post-training algorithms such as Supervised Fine-Tuning (SFT) and Group Relative Policy Optimization (GRPO) have demonstrated strong reasoning gains in language models, how reward design shapes VLM physical reasoning behavior remains poorly understood. We present a systematic reward ablation study for GRPO-based VLM training on physical reasoning. We compare four reward signals of increasing semantic richness: format compliance, answer accuracy, a composite rubric reward (answer correctness, physics principle identification, and unit consistency), and a novel internal reward derived from model attention weights over input image regions. We evaluate on PhyX, a 3,000-problem benchmark spanning six physics domains and six reasoning types across multiple-choice and open-ended formats, using IBM Granite Vision 3.3 (2B). Across both formats, GRPO with accuracy-based rewards outperforms SFT on most domains, though gains vary substantially by reward type and domain. Reward design does not uniformly improve performance. Instead, it induces domain-specific reasoning behaviors. Accuracy-based rewards provide the strongest overall gains. Rubric rewards improve structured reasoning quality without consistent accuracy improvements. Attention-based rewards enhance spatial reasoning while degrading performance in symbolic domains. Our internal attention-weight reward requires no spatial annotations and improves spatial relation accuracy from 0.27 to 0.50, suggesting that supervising where the model attends during generation is a promising direction for visually grounded physical reasoning.",
          "authors": [
            "Derek Lilienthal",
            "Manisha Mukherjee",
            "Sameera Horawalavithana"
          ],
          "categories": [
            "cs.AI",
            "cs.CL",
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13993v1",
          "abstract_url": "https://arxiv.org/abs/2604.13993v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13993v1",
          "published_at": "2026-04-15T15:36:26+00:00",
          "updated_at": "2026-04-15T15:36:26+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.13993",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13993v1"
          },
          "relevance_score": 87,
          "match_reasons": [
            "title matched \"reasoning\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13993"
        },
        {
          "title": "ToolOmni: Enabling Open-World Tool Use via Agentic learning with Proactive Retrieval and Grounded Execution",
          "summary": "Large Language Models (LLMs) enhance their problem-solving capability by utilizing external tools. However, in open-world scenarios with massive and evolving tool repositories, existing methods relying on static embedding retrieval or parameter memorization of tools struggle to align user intent with tool semantics or generalize to unseen tools, respectively, leading to suboptimal accuracy of open-world tool retrieval and execution. To address these, we present ToolOmni, a unified agentic framework that enables LLMs for open-world tool use by proactive retrieval and grounded execution within a reasoning loop. First, we construct a cold-start multi-turn interaction dataset to instill foundational agentic capabilities via Supervised Fine-Tuning (SFT). Then, we introduce open-world tool learning based on a Decoupled Multi-Objective GRPO algorithm, which simultaneously optimizes LLMs for both tool retrieval accuracy and execution efficacy in online environments. Extensive experiments demonstrate that ToolOmni achieves state-of-the-art performance both in retrieval and execution, surpassing strong baselines by a significant margin of +10.8% in end-to-end execution success rate, while exhibiting exceptional robustness and generalization capabilities.",
          "authors": [
            "Shouzheng Huang",
            "Meishan Zhang",
            "Baotian Hu",
            "Min Zhang"
          ],
          "categories": [
            "cs.CL"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13787v1",
          "abstract_url": "https://arxiv.org/abs/2604.13787v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13787v1",
          "published_at": "2026-04-15T12:26:10+00:00",
          "updated_at": "2026-04-15T12:26:10+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "数据",
            "方法"
          ],
          "topics": [
            "Reasoning",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2604.13787",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13787v1"
          },
          "relevance_score": 84,
          "match_reasons": [
            "title matched \"agent\"",
            "summary matched \"reasoning\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13787"
        },
        {
          "title": "Who Gets Flagged? The Pluralistic Evaluation Gap in AI Content Watermarking",
          "summary": "Watermarking is becoming the default mechanism for AI content authentication, with governance policies and frameworks referencing it as infrastructure for content provenance. Yet across text, image, and audio modalities, watermark signal strength, detectability, and robustness depend on statistical properties of the content itself, properties that vary systematically across languages, cultural visual traditions, and demographic groups. We examine how this content dependence creates modality-specific pathways to bias. Reviewing the major watermarking benchmarks across modalities, we find that, with one exception, none report performance across languages, cultural content types, or population groups. To address this, we propose three concrete evaluation dimensions for pluralistic watermark benchmarking: cross-lingual detection parity, culturally diverse content coverage, and demographic disaggregation of detection metrics. We connect these to the governance frameworks currently mandating watermarking deployment and show that watermarking is held to a lower fairness standard than the generative systems it is meant to govern. Our position is that evaluation must precede deployment, and that the same bias auditing requirements applied to AI models should extend to the verification layer.",
          "authors": [
            "Alexander Nemecek",
            "Osama Zafar",
            "Yuqiao Xu",
            "Wenbiao Li",
            "Erman Ayday"
          ],
          "categories": [
            "cs.CY",
            "cs.CL",
            "cs.CR",
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13776v1",
          "abstract_url": "https://arxiv.org/abs/2604.13776v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13776v1",
          "published_at": "2026-04-15T12:06:56+00:00",
          "updated_at": "2026-04-15T12:06:56+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Evaluation"
          ],
          "doi": null,
          "arxiv_id": "2604.13776",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13776v1"
          },
          "relevance_score": 84,
          "match_reasons": [
            "title matched \"evaluation\"",
            "summary matched \"benchmark\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13776"
        }
      ]
    },
    {
      "name": "Vision",
      "key_points": [
        "《ROSE: Retrieval-Oriented Segmentation Enhancement》〔评测 / 方法〕：Existing segmentation models based on multimodal large language models (MLLMs), such as LISA, often struggle with novel or emerging entities due to their inabi…",
        "《Decoding the Delta: Unifying Remote Sensing Change Detection and Understanding with Multimodal Large Language Models》〔评测 / 应用 / 方法〕：While Multimodal Large Language Models (MLLMs) excel in general vision-language tasks, their application to remote sensing change understanding is hindered by…",
        "《Free Lunch for Unified Multimodal Models: Enhancing Generation via Reflective Rectification with Inherent Understanding》〔方法〕：Unified Multimodal Models (UMMs) aim to integrate visual understanding and generation within a single structure. However, these models exhibit a notable capabi…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "ROSE: Retrieval-Oriented Segmentation Enhancement",
          "summary": "Existing segmentation models based on multimodal large language models (MLLMs), such as LISA, often struggle with novel or emerging entities due to their inability to incorporate up-to-date knowledge. To address this challenge, we introduce the Novel Emerging Segmentation Task (NEST), which focuses on segmenting (i) novel entities that MLLMs fail to recognize due to their absence from training data, and (ii) emerging entities that exist within the model's knowledge but demand up-to-date external information for accurate recognition. To support the study of NEST, we construct a NEST benchmark using an automated pipeline that generates news-related data samples for comprehensive evaluation. Additionally, we propose ROSE: Retrieval-Oriented Segmentation Enhancement, a plug-and-play framework designed to augment any MLLM-based segmentation model. ROSE comprises four key components. First, an Internet Retrieval-Augmented Generation module is introduced to employ user-provided multimodal inputs to retrieve real-time web information. Then, a Textual Prompt Enhancer enriches the model with up-to-date information and rich background knowledge, improving the model's perception ability for emerging entities. Furthermore, a Visual Prompt Enhancer is proposed to compensate for MLLMs' lack of exposure to novel entities by leveraging internet-sourced images. To maintain efficiency, a WebSense module is introduced to intelligently decide when to invoke retrieval mechanisms based on user input. Experimental results demonstrate that ROSE significantly boosts performance on the NEST benchmark, outperforming a strong Gemini-2.0 Flash-based retrieval baseline by 19.2 in gIoU.",
          "authors": [
            "Song Tang",
            "Guangquan Jie",
            "Henghui Ding",
            "Yu-Gang Jiang"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14147v1",
          "abstract_url": "https://arxiv.org/abs/2604.14147v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14147v1",
          "published_at": "2026-04-15T17:59:35+00:00",
          "updated_at": "2026-04-15T17:59:35+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Language Model"
          ],
          "doi": null,
          "arxiv_id": "2604.14147",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14147v1"
          },
          "relevance_score": 90,
          "match_reasons": [
            "title matched \"segmentation\"",
            "summary matched \"multimodal\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14147"
        },
        {
          "title": "Decoding the Delta: Unifying Remote Sensing Change Detection and Understanding with Multimodal Large Language Models",
          "summary": "While Multimodal Large Language Models (MLLMs) excel in general vision-language tasks, their application to remote sensing change understanding is hindered by a fundamental \"temporal blindness\". Existing architectures lack intrinsic mechanisms for multi-temporal contrastive reasoning and struggle with precise spatial grounding. To address this, we first introduce Delta-QA, a comprehensive benchmark comprising 180k visual question-answering samples. Delta-QA unifies pixel-level segmentation and visual question answering across bi- and tri-temporal scenarios, structuring change interpretation into four progressive cognitive dimensions. Methodologically, we propose Delta-LLaVA, a novel MLLM framework explicitly tailored for multi-temporal remote sensing interpretation. It overcomes the limitations of naive feature concatenation through three core innovations: a Change-Enhanced Attention module that systematically isolates and amplifies visual differences, a Change-SEG module utilizing Change Prior Embedding to extract differentiable difference features as input for the LLM, and Local Causal Attention to prevent cross-temporal contextual leakage. Extensive experiments demonstrate that Delta-LLaVA decisively outperforms leading generalist MLLMs and specialized segmentation models in complex change deduction and high-precision boundary localization, establishing a unified framework for earth observation intelligence.",
          "authors": [
            "Xiaohe Li",
            "Jiahao Li",
            "Kaixin Zhang",
            "Yuqiang Fang",
            "Leilei Lin",
            "Hong Wang",
            "Haohua Wu",
            "Zide Fan"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14044v1",
          "abstract_url": "https://arxiv.org/abs/2604.14044v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14044v1",
          "published_at": "2026-04-15T16:23:05+00:00",
          "updated_at": "2026-04-15T16:23:05+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.14044",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14044v1"
          },
          "relevance_score": 88,
          "match_reasons": [
            "title matched \"multimodal\"",
            "summary matched \"segmentation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14044"
        },
        {
          "title": "Free Lunch for Unified Multimodal Models: Enhancing Generation via Reflective Rectification with Inherent Understanding",
          "summary": "Unified Multimodal Models (UMMs) aim to integrate visual understanding and generation within a single structure. However, these models exhibit a notable capability mismatch, where their understanding capability significantly outperforms their generation. This mismatch indicates that the model's rich internal knowledge, while effective for understanding tasks, remains underactivated during generation. To address this, we draw inspiration from the human ``Thinking-While-Drawing'' paradigm, where humans continuously reflect to activate their knowledge and rectify intermediate results. In this paper, we propose UniRect-CoT, a training-free unified rectification chain-of-thought framework. Our approach unlocks the ``free lunch'' hidden in the UMM's powerful inherent understanding to continuously reflect, activating its internal knowledge and rectifying intermediate results during generation.We regard the diffusion denoising process in UMMs as an intrinsic visual reasoning process and align the intermediate results with the target instruction understood by the model, serving as a self-supervisory signal to rectify UMM generation.Extensive experiments demonstrate that UniRect-CoT can be easily integrated into existing UMMs, significantly enhancing generation quality across diverse complex tasks.",
          "authors": [
            "Yibo Jiang",
            "Tao Wu",
            "Rui Jiang",
            "Yehao Lu",
            "Chaoxiang Cai",
            "Zequn Qin",
            "Xi Li"
          ],
          "categories": [
            "cs.CV",
            "cs.AI"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13540v1",
          "abstract_url": "https://arxiv.org/abs/2604.13540v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13540v1",
          "published_at": "2026-04-15T06:41:56+00:00",
          "updated_at": "2026-04-15T06:41:56+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "方法"
          ],
          "topics": [
            "Reasoning",
            "Multimodal"
          ],
          "doi": null,
          "arxiv_id": "2604.13540",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13540v1"
          },
          "relevance_score": 78,
          "match_reasons": [
            "title matched \"multimodal\"",
            "summary matched \"diffusion\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13540"
        },
        {
          "title": "DiT as Real-Time Rerenderer: Streaming Video Stylization with Autoregressive Diffusion Transformer",
          "summary": "Recent advances in video generation models has significantly accelerated video generation and related downstream tasks. Among these, video stylization holds important research value in areas such as immersive applications and artistic creation, attracting widespread attention. However, existing diffusion-based video stylization methods struggle to maintain stability and consistency when processing long videos, and their high computational cost and multi-step denoising make them difficult to apply in practical scenarios. In this work, we propose RTR-DiT (DiT as Real-Time Rerenderer), a steaming video stylization framework built upon Diffusion Transformer. We first fine-tune a bidirectional teacher model on a curated video stylization dataset, supporting both text-guided and reference-guided video stylization tasks, and subsequently distill it into a few-step autoregressive model via post-training with Self Forcing and Distribution Matching Distillation. Furthermore, we propose a reference-preserving KV cache update strategy that not only enables stable and consistent processing of long videos, but also supports real-time switching between text prompts and reference images. Experimental results show that RTR-DiT outperforms existing methods in both text-guided and reference-guided video stylization tasks, in terms of quantitative metrics and visual quality, and demonstrates excellent performance in real-time long video stylization and interactive style-switching applications.",
          "authors": [
            "Hengye Lyu",
            "Zisu Li",
            "Yue Hong",
            "Yueting Weng",
            "Jiaxin Shi",
            "Hanwang Zhang",
            "Chen Liang"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13509v1",
          "abstract_url": "https://arxiv.org/abs/2604.13509v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13509v1",
          "published_at": "2026-04-15T05:52:43+00:00",
          "updated_at": "2026-04-15T05:52:43+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "Diffusion",
            "Video Generation"
          ],
          "doi": null,
          "arxiv_id": "2604.13509",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13509v1"
          },
          "relevance_score": 78,
          "match_reasons": [
            "title matched \"diffusion\"",
            "summary matched \"video generation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13509"
        },
        {
          "title": "Seedance 2.0: Advancing Video Generation for World Complexity",
          "summary": "Seedance 2.0 is a new native multi-modal audio-video generation model, officially released in China in early February 2026. Compared with its predecessors, Seedance 1.0 and 1.5 Pro, Seedance 2.0 adopts a unified, highly efficient, and large-scale architecture for multi-modal audio-video joint generation. This allows it to support four input modalities: text, image, audio, and video, by integrating one of the most comprehensive suites of multi-modal content reference and editing capabilities available in the industry to date. It delivers substantial, well-rounded improvements across all key sub-dimensions of video and audio generation. In both expert evaluations and public user tests, the model has demonstrated performance on par with the leading levels in the field. Seedance 2.0 supports direct generation of audio-video content with durations ranging from 4 to 15 seconds, with native output resolutions of 480p and 720p. For multi-modal inputs as reference, its current open platform supports up to 3 video clips, 9 images, and 3 audio clips. In addition, we provide Seedance 2.0 Fast version, an accelerated variant of Seedance 2.0 designed to boost generation speed for low-latency scenarios. Seedance 2.0 has delivered significant improvements to its foundational generation capabilities and multi-modal generation performance, bringing an enhanced creative experience for end users.",
          "authors": [
            "Team Seedance",
            "De Chen",
            "Liyang Chen",
            "Xin Chen",
            "Ying Chen",
            "Zhuo Chen",
            "Zhuowei Chen",
            "Feng Cheng",
            "Tianheng Cheng",
            "Yufeng Cheng",
            "Mojie Chi",
            "Xuyan Chi",
            "Jian Cong",
            "Qinpeng Cui",
            "Fei Ding",
            "Qide Dong",
            "Yujiao Du",
            "Haojie Duanmu",
            "Junliang Fan",
            "Jiarui Fang",
            "Jing Fang",
            "Zetao Fang",
            "Chengjian Feng",
            "Yu Gao",
            "Diandian Gu",
            "Dong Guo",
            "Hanzhong Guo",
            "Qiushan Guo",
            "Boyang Hao",
            "Hongxiang Hao",
            "Haoxun He",
            "Jiaao He",
            "Qian He",
            "Tuyen Hoang",
            "Heng Hu",
            "Ruoqing Hu",
            "Yuxiang Hu",
            "Jiancheng Huang",
            "Weilin Huang",
            "Zhaoyang Huang",
            "Zhongyi Huang",
            "Jishuo Jin",
            "Ming Jing",
            "Ashley Kim",
            "Shanshan Lao",
            "Yichong Leng",
            "Bingchuan Li",
            "Gen Li",
            "Haifeng Li",
            "Huixia Li",
            "Jiashi Li",
            "Ming Li",
            "Xiaojie Li",
            "Xingxing Li",
            "Yameng Li",
            "Yiying Li",
            "Yu Li",
            "Yueyan Li",
            "Chao Liang",
            "Han Liang",
            "Jianzhong Liang",
            "Ying Liang",
            "Wang Liao",
            "J. H. Lien",
            "Shanchuan Lin",
            "Xi Lin",
            "Feng Ling",
            "Yue Ling",
            "Fangfang Liu",
            "Jiawei Liu",
            "Jihao Liu",
            "Jingtuo Liu",
            "Shu Liu",
            "Sichao Liu",
            "Wei Liu",
            "Xue Liu",
            "Zuxi Liu",
            "Ruijie Lu",
            "Lecheng Lyu",
            "Jingting Ma",
            "Tianxiang Ma",
            "Xiaonan Nie",
            "Jingzhe Ning",
            "Junjie Pan",
            "Xitong Pan",
            "Ronggui Peng",
            "Xueqiong Qu",
            "Yuxi Ren",
            "Yuchen Shen",
            "Guang Shi",
            "Lei Shi",
            "Yinglong Song",
            "Fan Sun",
            "Li Sun",
            "Renfei Sun",
            "Wenjing Tang",
            "Boyang Tao",
            "Zirui Tao",
            "Dongliang Wang",
            "Feng Wang",
            "Hulin Wang",
            "Ke Wang",
            "Qingyi Wang",
            "Rui Wang",
            "Shuai Wang",
            "Shulei Wang",
            "Weichen Wang",
            "Xuanda Wang",
            "Yanhui Wang",
            "Yue Wang",
            "Yuping Wang",
            "Yuxuan Wang",
            "Zijie Wang",
            "Ziyu Wang",
            "Guoqiang Wei",
            "Meng Wei",
            "Di Wu",
            "Guohong Wu",
            "Hanjie Wu",
            "Huachao Wu",
            "Jian Wu",
            "Jie Wu",
            "Ruolan Wu",
            "Shaojin Wu",
            "Xiaohu Wu",
            "Xinglong Wu",
            "Yonghui Wu",
            "Ruiqi Xia",
            "Xin Xia",
            "Xuefeng Xiao",
            "Shuang Xu",
            "Bangbang Yang",
            "Jiaqi Yang",
            "Runkai Yang",
            "Tao Yang",
            "Yihang Yang",
            "Zhixian Yang",
            "Ziyan Yang",
            "Fulong Ye",
            "Bingqian Yi",
            "Xing Yin",
            "Yongbin You",
            "Linxiao Yuan",
            "Weihong Zeng",
            "Xuejiao Zeng",
            "Yan Zeng",
            "Siyu Zhai",
            "Zhonghua Zhai",
            "Bowen Zhang",
            "Chenlin Zhang",
            "Heng Zhang",
            "Jun Zhang",
            "Manlin Zhang",
            "Peiyuan Zhang",
            "Shuo Zhang",
            "Xiaohe Zhang",
            "Xiaoying Zhang",
            "Xinyan Zhang",
            "Xinyi Zhang",
            "Yichi Zhang",
            "Zixiang Zhang",
            "Haiyu Zhao",
            "Huating Zhao",
            "Liming Zhao",
            "Yian Zhao",
            "Guangcong Zheng",
            "Jianbin Zheng",
            "Xiaozheng Zheng",
            "Zerong Zheng",
            "Kuan Zhu",
            "Feilong Zuo"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14148v1",
          "abstract_url": "https://arxiv.org/abs/2604.14148v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14148v1",
          "published_at": "2026-04-15T17:59:40+00:00",
          "updated_at": "2026-04-15T17:59:40+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Evaluation",
            "Video Generation"
          ],
          "doi": null,
          "arxiv_id": "2604.14148",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14148v1"
          },
          "relevance_score": 72,
          "match_reasons": [
            "title matched \"video generation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14148"
        },
        {
          "title": "POINTS-Seeker: Towards Training a Multimodal Agentic Search Model from Scratch",
          "summary": "While Large Multimodal Models (LMMs) demonstrate impressive visual perception, they remain epistemically constrained by their static parametric knowledge. To transcend these boundaries, multimodal search models have been adopted to actively interact with the external environment for evidence retrieval. Diverging from prevailing paradigms that merely retrofit general LMMs with search tools as modular extensions, we explore the potential of building a multimodal agentic search model from scratch. Specifically, we make the following contributions: (i) we introduce Agentic Seeding, a dedicated phase designed to weave the foundational precursors necessary for eliciting agentic behaviors; (ii) we uncover a performance bottleneck in long-horizon interactions, where the increasing volume of interaction history overwhelms the model's ability to locate ground-truth evidence. To mitigate this, we propose V-Fold, an adaptive history-aware compression scheme that preserves recent dialogue turns in high fidelity while folding historical context into the visual space via rendering; and (iii) we develop POINTS-Seeker-8B, a state-of-the-art multimodal agentic search model that consistently outperforms existing models across six diverse benchmarks, effectively resolving the challenges of long-horizon, knowledge-intensive visual reasoning.",
          "authors": [
            "Yikun Liu",
            "Yuan Liu",
            "Le Tian",
            "Xiao Zhou",
            "Jiangchao Yao",
            "Yanfeng Wang",
            "Weidi Xie"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.14029v1",
          "abstract_url": "https://arxiv.org/abs/2604.14029v1",
          "pdf_url": "https://arxiv.org/pdf/2604.14029v1",
          "published_at": "2026-04-15T16:09:37+00:00",
          "updated_at": "2026-04-15T16:09:37+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": null,
          "arxiv_id": "2604.14029",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.14029v1"
          },
          "relevance_score": 70,
          "match_reasons": [
            "title matched \"multimodal\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.14029"
        },
        {
          "title": "Remote Sensing Image Super-Resolution for Imbalanced Textures: A Texture-Aware Diffusion Framework",
          "summary": "Generative diffusion priors have recently achieved state-of-the-art performance in natural image super-resolution, demonstrating a powerful capability to synthesize photorealistic details. However, their direct application to remote sensing image super-resolution (RSISR) reveals significant shortcomings. Unlike natural images, remote sensing images exhibit a unique texture distribution where ground objects are globally stochastic yet locally clustered, leading to highly imbalanced textures. This imbalance severely hinders the model's spatial perception. To address this, we propose TexADiff, a novel framework that begins by estimating a Relative Texture Density Map (RTDM) to represent the texture distribution. TexADiff then leverages this RTDM in three synergistic ways: as an explicit spatial conditioning to guide the diffusion process, as a loss modulation term to prioritize texture-rich regions, and as a dynamic adapter for the sampling schedule. These modifications are designed to endow the model with explicit texture-aware capabilities. Experiments demonstrate that TexADiff achieves superior or competitive quantitative metrics. Furthermore, qualitative results show that our model generates faithful high-frequency details while effectively suppressing texture hallucinations. This improved reconstruction quality also results in significant gains in downstream task performance. The source code of our method can be found at https://github.com/ZezFuture/TexAdiff.",
          "authors": [
            "Enzhuo Zhang",
            "Sijie Zhao",
            "Dilxat Muhtar",
            "Zhenshi Li",
            "Xueliang Zhang",
            "Pengfeng Xiao"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13994v1",
          "abstract_url": "https://arxiv.org/abs/2604.13994v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13994v1",
          "published_at": "2026-04-15T15:36:49+00:00",
          "updated_at": "2026-04-15T15:36:49+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Diffusion"
          ],
          "doi": null,
          "arxiv_id": "2604.13994",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13994v1"
          },
          "relevance_score": 69,
          "match_reasons": [
            "title matched \"diffusion\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13994"
        },
        {
          "title": "Blind Bitstream-corrupted Video Recovery via Metadata-guided Diffusion Model",
          "summary": "Bitstream-corrupted video recovery aims to restore realistic content degraded during video storage or transmission. Existing methods typically assume that predefined masks of corrupted regions are available, but manually annotating these masks is labor-intensive and impractical in real-world scenarios. To address this limitation, we introduce a new blind video recovery setting that removes the reliance on predefined masks. This setting presents two major challenges: accurately identifying corrupted regions and recovering content from extensive and irregular degradations. We propose a Metadata-Guided Diffusion Model (M-GDM) to tackle these challenges. Specifically, intrinsic video metadata are leveraged as corruption indicators through a dual-stream metadata encoder that separately embeds motion vectors and frame types before fusing them into a unified representation. This representation interacts with corrupted latent features via cross-attention at each diffusion step. To preserve intact regions, we design a prior-driven mask predictor that generates pseudo masks using both metadata and diffusion priors, enabling the separation and recombination of intact and recovered regions through hard masking. To mitigate boundary artifacts caused by imperfect masks, a post-refinement module enhances consistency between intact and recovered regions. Extensive experiments demonstrate the effectiveness of our method and its superiority in blind video recovery. Code is available at: https://github.com/Shuyun-Wang/M-GDM.",
          "authors": [
            "Shuyun Wang",
            "Hu Zhang",
            "Xin Shen",
            "Dadong Wang",
            "Xin Yu"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13906v1",
          "abstract_url": "https://arxiv.org/abs/2604.13906v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13906v1",
          "published_at": "2026-04-15T14:15:07+00:00",
          "updated_at": "2026-04-15T14:15:07+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "应用",
            "方法"
          ],
          "topics": [
            "Diffusion"
          ],
          "doi": null,
          "arxiv_id": "2604.13906",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13906v1"
          },
          "relevance_score": 68,
          "match_reasons": [
            "title matched \"diffusion\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13906"
        },
        {
          "title": "PBE-UNet: A light weight Progressive Boundary-Enhanced U-Net with Scale-Aware Aggregation for Ultrasound Image Segmentation",
          "summary": "Accurate lesion segmentation in ultrasound images is essential for preventive screening and clinical diagnosis, yet remains challenging due to low contrast, blurry boundaries, and significant scale variations. Although existing deep learning-based methods have achieved remarkable performance, these methods still struggle with scale variations and indistinct tumor boundaries. To address these challenges, we propose a progressive boundary enhanced U-Net (PBE-UNet). Specially, we first introduce a scale-aware aggregation module (SAAM) that dynamically adjusts its receptive field to capture robust multi-scale contextual information. Then, we propose a boundary-guided feature enhancement (BGFE) module to enhance the feature representations. We find that there are large gaps between the narrow boundary and the wide segmentation error areas. Unlike existing methods that treat boundaries as static masks, the BGFE module progressively expands the narrow boundary prediction into broader spatial attention maps. Thus, broader spatial attention maps could effectively cover the wider segmentation error regions and enhance the model's focus on these challenging areas. We conduct expensive experiments on four benchmark ultrasound datasets, BUSI, Dataset B, TN3K, and BP. The experimental results how that our proposed PBE-UNet outperforms state-of-the-art ultrasound image segmentation methods. The code is at https://github.com/cruelMouth/PBE-UNet.",
          "authors": [
            "Chen Wang",
            "Yixin Zhu",
            "Yongbin Zhu",
            "Fengyuan Shi",
            "Qi Li",
            "Jun Wang",
            "Zuozhu Liu",
            "Keli Hu"
          ],
          "categories": [
            "cs.CV"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13791v1",
          "abstract_url": "https://arxiv.org/abs/2604.13791v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13791v1",
          "published_at": "2026-04-15T12:31:20+00:00",
          "updated_at": "2026-04-15T12:31:20+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Clinical"
          ],
          "doi": null,
          "arxiv_id": "2604.13791",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13791v1"
          },
          "relevance_score": 66,
          "match_reasons": [
            "title matched \"segmentation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13791"
        },
        {
          "title": "Design and Behavior of Sparse Mixture-of-Experts Layers in CNN-based Semantic Segmentation",
          "summary": "Sparse mixture-of-experts (MoE) layers have been shown to substantially increase model capacity without a proportional increase in computational cost and are widely used in transformer architectures, where they typically replace feed-forward network blocks. In contrast, integrating sparse MoE layers into convolutional neural networks (CNNs) remains inconsistent, with most prior work focusing on fine-grained MoEs operating at the filter or channel levels. In this work, we investigate a coarser, patch-wise formulation of sparse MoE layers for semantic segmentation, where local regions are routed to a small subset of convolutional experts. Through experiments on the Cityscapes and BDD100K datasets using encoder-decoder and backbone-based CNNs, we conduct a design analysis to assess how architectural choices affect routing dynamics and expert specialization. Our results demonstrate consistent, architecture-dependent improvements (up to +3.9 mIoU) with little computational overhead, while revealing strong design sensitivity. Our work provides empirical insights into the design and internal dynamics of sparse MoE layers in CNN-based dense prediction. Our code is available at https://github.com/KASTEL-MobilityLab/moe-layers/.",
          "authors": [
            "Svetlana Pavlitska",
            "Haixi Fan",
            "Konstantin Ditschuneit",
            "J. Marius Zöllner"
          ],
          "categories": [
            "cs.CV",
            "cs.LG"
          ],
          "paper_id": "http://arxiv.org/abs/2604.13761v1",
          "abstract_url": "https://arxiv.org/abs/2604.13761v1",
          "pdf_url": "https://arxiv.org/pdf/2604.13761v1",
          "published_at": "2026-04-15T11:47:34+00:00",
          "updated_at": "2026-04-15T11:47:34+00:00",
          "source": "arxiv",
          "date_label": "Published",
          "analysis": null,
          "tags": [
            "数据",
            "方法"
          ],
          "topics": [
            "Segmentation"
          ],
          "doi": null,
          "arxiv_id": "2604.13761",
          "source_variants": [
            "arxiv"
          ],
          "source_urls": {
            "arxiv": "https://arxiv.org/abs/2604.13761v1"
          },
          "relevance_score": 66,
          "match_reasons": [
            "title matched \"segmentation\"",
            "has PDF",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "arxiv:2604.13761"
        }
      ]
    },
    {
      "name": "PubMed AI",
      "key_points": [
        "《Augmenting Large Language Model With Prompt Engineering and Supervised Fine-Tuning in Non-Small Cell Lung Cancer Tumor-Node-Metastasis Staging: Framework Development and Validation.》〔评测 / 数据 / 应用 / 方法〕：BACKGROUND: Accurate tumor node metastasis (TNM) staging is fundamental for treatment planning and prognosis in non-small cell lung cancer (NSCLC). However, it…",
        "《PKFAR: psychiatry knowledge-fused augmented reasoning with large language models.》〔评测 / 应用 / 方法〕：PURPOSE: Psychiatric diagnosis faces significant challenges due to subjective symptom reporting and complex diagnostic criteria. While Large Language Models (L…",
        "《Fact-Checking Large Language Model Responses to a Health Care Prompt: Comparative Study.》〔评测 / 应用 / 方法〕：BACKGROUND: Large language models use machine learning to produce natural language. These models have a range of potential applications in health care, such as…"
      ],
      "sort_by": "hybrid",
      "papers": [
        {
          "title": "Augmenting Large Language Model With Prompt Engineering and Supervised Fine-Tuning in Non-Small Cell Lung Cancer Tumor-Node-Metastasis Staging: Framework Development and Validation.",
          "summary": "BACKGROUND: Accurate tumor node metastasis (TNM) staging is fundamental for treatment planning and prognosis in non-small cell lung cancer (NSCLC). However, its complexity poses significant challenges. Traditional rule-based natural language processing methods are constrained by their reliance on manually crafted rules and are susceptible to inconsistencies in clinical reporting. OBJECTIVE: This study aimed to develop and validate a robust, accurate, and operationally efficient artificial intelligence framework for the TNM staging of NSCLC by strategically enhancing a large language model, GLM-4-Air (general language model), through advanced prompt engineering and supervised fine-tuning (SFT). METHODS: We constructed a curated dataset of 492 deidentified real-world medical imaging reports, with TNM staging annotations rigorously validated by senior physicians according to the AJCC (American Joint Committee on Cancer) 8th edition guidelines. The GLM-4-Air model was systematically optimized via a multi-phase process: iterative prompt engineering incorporating chain-of-thought reasoning and domain knowledge injection for all staging tasks, followed by parameter-efficient SFT using low-rank adaptation for the reasoning-intensive primary tumor characteristics (T) and regional lymph node involvement (N) staging tasks. The final hybrid model was evaluated on a completely held-out test set (black-box) and benchmarked against GPT-4o using standard metrics, statistical tests, and a clinical impact analysis of staging errors. RESULTS: The optimized hybrid GLM-4-Air model demonstrated reliable performance. It achieved higher staging accuracies on the black-box test set: 92% (95% CI 0.850-0.959) for T, 86% (95% CI 0.779-0.915) for N, 92% (95% CI 0.850-0.959) for distant metastasis status (M), and 90% for overall clinical staging; by comparison, GPT-4o attained 87% (95% CI 0.790-0.922), 70% (95% CI 0.604-0.781), 78% (95% CI 0.689-0.850), and 80%, respectively. The model's robustness was further evidenced by its macro-average F1-scores of 0.914 (T), 0.815 (N), and 0.831 (M), consistently surpassing those of GPT-4o (0.836, 0.620, and 0.698). Analysis of confusion matrices confirmed the model's proficiency in identifying critical staging features while effectively minimizing false negatives. Crucially, the clinical impact assessment showed a substantial reduction in severe category I errors, which are defined as misclassifications that could significantly influence subsequent clinical decisions. Our model committed 0 category I errors in M staging and fewer category I errors in T and N staging. Furthermore, the framework demonstrated practical deployability, achieving efficient inference on consumer-grade hardware (eg, 4 RTX 4090 GPUs) with latencies suitable and acceptable for clinical workflows. CONCLUSIONS: The proposed hybrid framework, integrating structured prompt engineering and applying SFT to reasoning-heavy tasks (T/N), enables the GLM-4-Air model to serve as a highly accurate, clinically reliable, and cost-efficient solution for automated NSCLC TNM staging. This work demonstrates the efficacy and potential of a domain-optimized smaller model compared with an off-the-shelf generalist model, holding promise for enhancing diagnostic standardization in resource-aware health care environments.",
          "authors": [
            "Ruonan Jin",
            "Chao Ling",
            "Yixuan Hou",
            "Yuhan Sun",
            "Ning Li",
            "Jiefei Han",
            "Jin Sheng",
            "Qizhao Wang",
            "Yuepeng Liu",
            "Shen Zheng",
            "Xingyu Ren",
            "Chiyu Chen",
            "Jue Wang",
            "Cheng Li"
          ],
          "categories": [
            "Journal Article"
          ],
          "paper_id": "pubmed:41984624",
          "abstract_url": "https://pubmed.ncbi.nlm.nih.gov/41984624/",
          "pdf_url": null,
          "published_at": "2026-04-15T12:53:00+00:00",
          "updated_at": "2026-04-15T12:53:00+00:00",
          "source": "pubmed",
          "date_label": "Entered",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": "10.2196/77988",
          "arxiv_id": null,
          "source_variants": [
            "pubmed"
          ],
          "source_urls": {
            "pubmed": "https://pubmed.ncbi.nlm.nih.gov/41984624/",
            "doi": "https://doi.org/10.2196/77988"
          },
          "relevance_score": 107,
          "match_reasons": [
            "title matched \"language model\"",
            "summary matched \"benchmark\"",
            "summary matched \"clinical\"",
            "has DOI",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.2196/77988"
        },
        {
          "title": "PKFAR: psychiatry knowledge-fused augmented reasoning with large language models.",
          "summary": "PURPOSE: Psychiatric diagnosis faces significant challenges due to subjective symptom reporting and complex diagnostic criteria. While Large Language Models (LLMs) offer potential clinical decision support, their implementation is hindered by privacy constraints on commercial models (e.g., GPT-o3, Gemini-2.5) and computational demands of massive-scale open-source alternatives (e.g., DeepSeek-R1). These constraints necessitate knowledge-enhanced approaches with smaller-scale LLMs as the primary research direction. However, existing methods fail to adequately address psychiatric complexities, necessitating a specialized solution for accurate diagnostic support. METHODS: We propose PKFAR (psychiatry knowledge-fused augmented reasoning), which incorporates two features: (1) PsychKG, a semantically-augmented psychiatric knowledge graph integrating psychiatric criteria with both node and relation descriptors, and (2) a three-stage hierarchical reasoning framework comprising symptom comprehension, disorder retrieval, and diagnosis reasoning. The system is evaluated on Mentat, MedQA_psychiatry, and MIMIC benchmarks using Qwen3-8B. RESULTS: Compared to standard one-shot CoT reasoning baselines, PKFAR achieves 12.4, 7.5, and 10.0% accuracy improvements using Qwen3-8B in zero-shot settings on Mentat, MedQA_psychiatry, and MIMIC respectively. Notably, our approach outperforms one-shot CoT performance of both GPT-o3 and DeepSeek-V3, while approaching the accuracy of DeepSeek-R1. CONCLUSION: Our knowledge-fused approach establishes an effective balance between computational efficiency and diagnostic precision in psychiatric applications. PKFAR's structured reasoning pathway and semantically-augmented knowledge method address critical limitations in current LLM-based clinical support systems, offering a practical solution for psychiatric diagnostics.",
          "authors": [
            "Rongzheng Wang",
            "Cheng Yu",
            "Qian Dong",
            "Jianqing Qiu",
            "Tao Wen",
            "Wei Zhang",
            "Ke Qin"
          ],
          "categories": [
            "Journal Article"
          ],
          "paper_id": "pubmed:41982804",
          "abstract_url": "https://pubmed.ncbi.nlm.nih.gov/41982804/",
          "pdf_url": null,
          "published_at": "2026-04-15T04:30:00+00:00",
          "updated_at": "2026-04-15T04:30:00+00:00",
          "source": "pubmed",
          "date_label": "Entered",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": "10.1007/s13755-026-00447-w",
          "arxiv_id": null,
          "source_variants": [
            "pubmed"
          ],
          "source_urls": {
            "pubmed": "https://pubmed.ncbi.nlm.nih.gov/41982804/",
            "doi": "https://doi.org/10.1007/s13755-026-00447-w"
          },
          "relevance_score": 98,
          "match_reasons": [
            "title matched \"language model\"",
            "summary matched \"benchmark\"",
            "summary matched \"clinical\"",
            "has DOI",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.1007/s13755-026-00447-w"
        },
        {
          "title": "Fact-Checking Large Language Model Responses to a Health Care Prompt: Comparative Study.",
          "summary": "BACKGROUND: Large language models use machine learning to produce natural language. These models have a range of potential applications in health care, such as patient education and diagnosis. However, evaluations of large language models in health care are still scarce. OBJECTIVE: This study aimed to (1) evaluate the accuracy and efficiency of automated fact-checking by 2 large language models and (2) illustrate a process through which a large language model might support a patient in redrafting a prompt to include key information needed for patient safety. METHODS: A parallel comparison of 2 large language models and 3 human experts was conducted. A clinical scenario was devised in which a woman aged 23 years questions the safety of retinoid treatment for acne by sending prompts to 2 large language models (GPT-4o and OpenBioLLM-70B). GPT-4o and OpenBioLLM-70B were asked to suggest improvements to the patient's initial prompt to elicit key information for clinical decision-making. After the patient sent the revised prompt to the large language models, the models were then asked to fact-check the final response. To test the generalizability of automated fact-checking, a set of 20 clinical statements on disparate topics, mostly related to drug indications, contraindications, and side effects, was developed. The large language models also fact-checked these 20 medical statements. The results were compared against the evaluations of 3 clinical experts. The outcome measures were as follows: (1) percentage of accuracy of automated fact-checking, (2) time to complete fact-checking, and (3) a binary outcome for prompt redrafting (advising the patient to revise her prompt by naming her acne medication to address safety concerns). RESULTS: For the scenario of a patient with acne, GPT-4o and OpenBioLLM-70B both had 86% agreement with the clinical experts' fact-checking. The large language models did not consistently convey the urgency of discontinuing isotretinoin treatment when pregnancy is suspected. In addition, the models did not adequately convey the importance of folic acid supplementation during pregnancy. For the set of 20 medical claims, GPT-4o fact-checking had 100% agreement with that of human experts, whereas OpenBioLLM-70B had 95% agreement. OpenBioLLM-70B diverged from human experts and GPT-4o on 1 question related to pediatric use of antihistamines. The expert fact-checks took a mean time of 18 (SD 3.74) minutes, GPT-4o took 42 seconds, and OpenBioLLM-70B took 33 minutes. The GPT-4o responses for the acne scenario had some inconsistencies but zero fabrication and no obvious omissions. In contrast, OpenBioLLM-70B omitted 1 key information item needed for patient safety. CONCLUSIONS: GPT-4o can interact with patients to improve the quality and comprehensiveness of the information contained in health-related prompts. GPT-4o and OpenBioLLM-70B can conduct efficient fact-checking that is close to the level of accuracy of human experts. Human experts need to perform additional checks for accuracy and safety.",
          "authors": [
            "Padhraig Ryan",
            "Orla Davoren",
            "Glyn Elwyn"
          ],
          "categories": [
            "Journal Article",
            "Comparative Study"
          ],
          "paper_id": "pubmed:41985066",
          "abstract_url": "https://pubmed.ncbi.nlm.nih.gov/41985066/",
          "pdf_url": null,
          "published_at": "2026-04-15T14:52:00+00:00",
          "updated_at": "2026-04-15T14:52:00+00:00",
          "source": "pubmed",
          "date_label": "Entered",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "Evaluation"
          ],
          "doi": "10.2196/68223",
          "arxiv_id": null,
          "source_variants": [
            "pubmed"
          ],
          "source_urls": {
            "pubmed": "https://pubmed.ncbi.nlm.nih.gov/41985066/",
            "doi": "https://doi.org/10.2196/68223"
          },
          "relevance_score": 91,
          "match_reasons": [
            "title matched \"language model\"",
            "summary matched \"clinical\"",
            "has DOI",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.2196/68223"
        },
        {
          "title": "Fine-Tuned Large Language Models for Automated Radiology Impression Generation: A Multicenter Evaluation.",
          "summary": "Purpose To develop a fine-tuned large language model (Medical Imaging Report Assistant, MIRA) and evaluate its performance in generating radiology impressions from multicenter data with respect to accuracy, reporting efficiency, and clinical applicability. Materials and Methods A retrospective multicenter dataset comprising 1.87 million radiology reports (including CT, MRI, and digital radiography data) from 42 hospitals across 22 provinces in China (January 2019 to August 2024) was compiled. The dataset was used to fine-tune an LLM via a prompt-based strategy. The evaluation framework incorporated both automated and human evaluation metrics. Radiologists evaluated internal and external datasets and three open-source datasets to compare impressions generated by the fine-tuned LLM and GPT-4o. Twenty-four radiologists from six centers performed blinded comparisons of MIRA generated and reference impressions to assess interrater consistency and drafting efficiency. Data were analyzed using appropriate parametric/nonparametric tests and χ 2 tests, with Holm-Bonferroni correction for multiple comparisons. Results The internal test set included data for 78,544 reports, median age, 52 years [IQR, 35-65], 39,351 males) and the external test set included data for (27,471 reports, median age, 53 years [IQR, 37-66], 13,955 males). Site/modality-aware prompting improved similarity ( P < .001): internal BERTScore-F/Sentence Similarity 0.92/0.92, external 0.82/0.80 under optimal settings; human evaluation ( n = 2,327) showed MIRA beat GPT-4o on both similarity and F1 score ( P < .001). MIRA-generated impressions were rated as at least as good as the reference impressions in 69.0% of blinded comparisons (1,657/2,400), reduced draft time by 0.46 min per report, and increased interradiologist agreement ( P < .001). Conclusion MIRA, a fine-tuned LLM using a prompt-based strategy, generated clinically aligned radiology impressions in multicenter settings, improving accuracy, efficiency, and reporting consistency. © The Authors 2026. Published by the Radiological Society of North America under a CC BY 4.0 license.",
          "authors": [
            "Mingyang Li",
            "Yaning Wang",
            "Zheng Miao",
            "Jiaqi Gong",
            "Simin Yang",
            "Han Xue",
            "Qi Yang",
            "Lijun Duan",
            "Lin Mu",
            "Ying Mu",
            "Kai Zhu",
            "Qi Dai",
            "Munire Aihemaiti",
            "Yunhui Yang",
            "Liang Liu",
            "Yingyan Zheng",
            "Yang Hou",
            "Lei Zhang",
            "Jing Wang",
            "Huimao Zhang"
          ],
          "categories": [
            "Journal Article"
          ],
          "paper_id": "pubmed:41983921",
          "abstract_url": "https://pubmed.ncbi.nlm.nih.gov/41983921/",
          "pdf_url": null,
          "published_at": "2026-04-15T10:03:00+00:00",
          "updated_at": "2026-04-15T10:03:00+00:00",
          "source": "pubmed",
          "date_label": "Entered",
          "analysis": null,
          "tags": [
            "评测",
            "数据",
            "应用",
            "方法"
          ],
          "topics": [
            "Language Model",
            "Evaluation"
          ],
          "doi": "10.1148/ryai.250714",
          "arxiv_id": null,
          "source_variants": [
            "pubmed"
          ],
          "source_urls": {
            "pubmed": "https://pubmed.ncbi.nlm.nih.gov/41983921/",
            "doi": "https://doi.org/10.1148/ryai.250714"
          },
          "relevance_score": 86,
          "match_reasons": [
            "title matched \"language model\"",
            "summary matched \"clinical\"",
            "has DOI",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.1148/ryai.250714"
        },
        {
          "title": "A Multi-AI Agent Framework for Interactive Neurosurgical Education and Evaluation: From Vignettes to Virtual Conversations.",
          "summary": "BACKGROUND AND OBJECTIVES: Traditional medical board examinations present clinical information in static vignettes with multiple-choices (MC), fundamentally different from how physicians gather and integrate data in practice. Recent advances in large language models (LLMs) offer promising approaches to creating more realistic clinical interactive conversations. However, these approaches are limited in neurosurgery, where patient communication capacity varies significantly and diagnosis heavily relies on objective data such as imaging and neurological examinations. We aimed to develop and evaluate a multi-artificial intelligence (AI) agent conversation framework for neurosurgical case assessment that enables realistic clinical interactions through simulated patients and structured access to objective clinical data. METHODS: We developed a framework to convert 608 Self-Assessment in Neurological Surgery first-order diagnosis questions into conversation sessions using 3 specialized AI agents: patient AI for subjective information, system AI for objective data, and clinical AI for diagnostic reasoning. We evaluated generative pretrained transformer 4o's (GPT-4o's) diagnostic accuracy across traditional vignettes, patient-only conversations, and patient + system AI interactions, with human benchmark testing from 10 neurosurgery residents. RESULTS: GPT-4o showed significant performance drops from traditional vignettes to conversational formats in both MC (89.0%-60.9%, P < .0001) and free-response scenarios (78.4%-30.3%, P < .0001). Adding access to objective data through system AI improved performance (to 67.4%, P = .0015; and 61.8%, P < .0001, respectively). Questions requiring image interpretation showed similar patterns but lower accuracy. Residents outperformed GPT-4o in free-response conversations (70.0% vs 28.3%, P = .0030) using fewer interactions and reported high educational value of the interactive format. CONCLUSION: This multi-AI agent framework provides both a more challenging evaluation method for LLMs and an engaging educational tool for neurosurgical training. The significant performance drops in conversational formats suggest that traditional MC testing may overestimate LLMs' clinical reasoning capabilities, while the framework's interactive nature offers promising applications for enhancing medical education.",
          "authors": [
            "Karl L Sangwon",
            "Jeff Zhang",
            "Robert Steele",
            "Jaden Stryker",
            "Joanne J Choi",
            "Jin Vivian Lee",
            "Daniel Alexander Alber",
            "Aly Valliani",
            "Nivedha Kannapadi",
            "James Ryoo",
            "Austin Feng",
            "Hammad A Khan",
            "Sean Neifert",
            "Cordelia Orillac",
            "Hannah K Weiss",
            "Nora C Kim",
            "David Kurland",
            "Howard A Riina",
            "Douglas Kondziolka",
            "Michal Mankowski",
            "Eric Karl Oermann"
          ],
          "categories": [
            "Journal Article"
          ],
          "paper_id": "pubmed:41982325",
          "abstract_url": "https://pubmed.ncbi.nlm.nih.gov/41982325/",
          "pdf_url": null,
          "published_at": "2026-04-15T04:24:00+00:00",
          "updated_at": "2026-04-15T04:24:00+00:00",
          "source": "pubmed",
          "date_label": "Entered",
          "analysis": null,
          "tags": [
            "评测",
            "应用",
            "方法"
          ],
          "topics": [
            "Benchmark",
            "Reasoning"
          ],
          "doi": "10.1227/neuprac.0000000000000217",
          "arxiv_id": null,
          "source_variants": [
            "pubmed"
          ],
          "source_urls": {
            "pubmed": "https://pubmed.ncbi.nlm.nih.gov/41982325/",
            "doi": "https://doi.org/10.1227/neuprac.0000000000000217"
          },
          "relevance_score": 76,
          "match_reasons": [
            "summary matched \"language model\"",
            "summary matched \"benchmark\"",
            "summary matched \"clinical\"",
            "has DOI",
            "has rich summary",
            "has complete metadata"
          ],
          "feedback_status": null,
          "feedback_note": null,
          "feedback_next_action": null,
          "feedback_due_date": null,
          "feedback_snoozed_until": null,
          "feedback_review_interval_days": null,
          "canonical_id": "doi:10.1227/neuprac.0000000000000217"
        }
      ]
    },
    {
      "name": "OpenAlex AI",
      "key_points": [],
      "sort_by": "hybrid",
      "papers": []
    }
  ]
}