大海捞针测试:检测长上下文模型的召回能力
FreeGuideOnline
最新
2026-06-22
python import tiktoken
干草堆文本生成(重复填充句)
# Filler sentence that is repeated to build the "haystack" text.
haystack_sentence = "The quick brown fox jumps over the lazy dog. "
# The "needle": the single fact hidden somewhere inside the haystack.
needle = "The special magic number is 2813308004."
def generate_context(target_length, needle_position):
    """Build a haystack of roughly ``target_length`` tokens containing the needle.

    The haystack is made by repeating the module-level ``haystack_sentence``;
    the module-level ``needle`` is spliced into it at the token level, and the
    combined token sequence is decoded back to text.

    Args:
        target_length: Desired total length in ``cl100k_base`` tokens,
            needle included. If it is smaller than the needle itself, the
            result is just the needle.
        needle_position: Relative insertion point as a fraction of the filler
            (0.0 = start, 1.0 = end). Values outside [0, 1] are clamped.

    Returns:
        The decoded context string with the needle embedded.
    """
    enc = tiktoken.get_encoding("cl100k_base")

    # Encode the needle once; it is needed for both sizing and splicing.
    needle_ids = enc.encode(needle)
    filler_tokens = max(0, target_length - len(needle_ids))

    # First guess at how many repetitions are needed. Token counts are not
    # additive under concatenation (the sentence's trailing space can merge
    # with the following word), so this guess may undershoot -- keep growing
    # the filler until it really holds enough tokens before truncating.
    repeats = filler_tokens // len(enc.encode(haystack_sentence)) + 1
    filler_encoded = enc.encode(haystack_sentence * repeats)
    while len(filler_encoded) < filler_tokens:
        repeats *= 2
        filler_encoded = enc.encode(haystack_sentence * repeats)
    filler_encoded = filler_encoded[:filler_tokens]

    # Clamp so that a negative fraction cannot turn into a negative slice
    # index, which would silently place the needle relative to the end.
    fraction = min(max(needle_position, 0.0), 1.0)
    insert_at = int(len(filler_encoded) * fraction)

    context_tokens = (
        filler_encoded[:insert_at] + needle_ids + filler_encoded[insert_at:]
    )
    return enc.decode(context_tokens)