
{"id":3786,"date":"2024-12-10T19:28:48","date_gmt":"2024-12-10T11:28:48","guid":{"rendered":"https:\/\/infernews.com\/?p=3786"},"modified":"2024-12-10T19:33:09","modified_gmt":"2024-12-10T11:33:09","slug":"rag-%e5%90%91%e9%87%8f%e4%b8%ad%e7%9a%84%e6%96%87%e4%bb%b6%e5%88%86%e5%89%b2%e7%ad%96%e7%95%a5","status":"publish","type":"post","link":"https:\/\/infernews.com\/blog\/rag-%e5%90%91%e9%87%8f%e4%b8%ad%e7%9a%84%e6%96%87%e4%bb%b6%e5%88%86%e5%89%b2%e7%ad%96%e7%95%a5\/","title":{"rendered":"RAG \u5411\u91cf\u4e2d\u7684\u6587\u4ef6\u5206\u5272\u7b56\u7565"},"content":{"rendered":"\n<p>\u6587\u4ef6\u5206\u5272\u7b56\u7565\u5c0d\u65bc\u5927\u578b\u8a9e\u8a00\u6a21\u578b(LLM)\u7684\u8cc7\u8a0a\u6aa2\u7d22\u6548\u80fd\u5f71\u97ff\u3002\u73fe\u6709\u8cc7\u8a0a\u6aa2\u7d22\u57fa\u6e96\u6e2c\u8a66\u901a\u5e38\u4ee5\u6574\u7bc7\u6587\u4ef6\u7684\u76f8\u95dc\u6027\u8a55\u4f30\u6548\u80fd\uff0c\u5ffd\u7565\u4e86\u6587\u4ef6\u5206\u5272\u7b56\u7565\u7684\u91cd\u8981\u6027\u3002RAG \u6587\u4ef6\u5206\u5272(Text Chunking) \u57fa\u65bc\u5b57\u5143\/\u8a5e(Token) \u7684\u5206\u5272\u3001\u905e\u8ff4\u5f0f\u5206\u5272\u3001\u8a9e\u7fa9\u5206\u5272\u7b49\uff0c\u4ee5\u627e\u51fa\u6700\u9069\u5408\u7528\u65bc\u5f8c\u7e8c\u57fa\u65bc\u6aa2\u7d22\u589e\u5f37\u751f\u6210\uff08RAG\uff09\u61c9\u7528\u7684\u6700\u4f73\u65b9\u6cd5\u3002<\/p>\n\n\n<figure class=\"wp-block-embed-youtube wp-block-embed is-type-video is-provider-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio\"><div class=\"lyte-wrapper\" title=\"The BEST Way to Chunk Text for RAG\" style=\"width:853px;max-width:100%;margin:5px auto;\"><div class=\"lyMe\" id=\"WYL_Pk2BeaGbcTE\" itemprop=\"video\" itemscope itemtype=\"https:\/\/schema.org\/VideoObject\"><div><meta itemprop=\"thumbnailUrl\" content=\"https:\/\/infernews.com\/blog\/wp-content\/plugins\/wp-youtube-lyte\/lyteCache.php?origThumbUrl=https%3A%2F%2Fi.ytimg.com%2Fvi%2FPk2BeaGbcTE%2Fhqdefault.jpg\" \/><meta itemprop=\"embedURL\" content=\"https:\/\/www.youtube.com\/embed\/Pk2BeaGbcTE\" \/><meta itemprop=\"duration\" content=\"PT33M17S\" \/><meta itemprop=\"uploadDate\" content=\"2024-12-09T13:00:34Z\" \/><\/div><div id=\"lyte_Pk2BeaGbcTE\" data-src=\"https:\/\/infernews.com\/blog\/wp-content\/plugins\/wp-youtube-lyte\/lyteCache.php?origThumbUrl=https%3A%2F%2Fi.ytimg.com%2Fvi%2FPk2BeaGbcTE%2Fhqdefault.jpg\" class=\"pL\"><div class=\"tC\"><div class=\"tT\" itemprop=\"name\">The BEST Way to Chunk Text for RAG<\/div><\/div><div class=\"play\"><\/div><div class=\"ctrl\"><div class=\"Lctrl\"><\/div><div class=\"Rctrl\"><\/div><\/div><\/div><noscript><a href=\"https:\/\/youtu.be\/Pk2BeaGbcTE\" rel=\"nofollow\"><img loading=\"lazy\" decoding=\"async\" src=\"https:\/\/infernews.com\/blog\/wp-content\/plugins\/wp-youtube-lyte\/lyteCache.php?origThumbUrl=https%3A%2F%2Fi.ytimg.com%2Fvi%2FPk2BeaGbcTE%2F0.jpg\" alt=\"The BEST Way to Chunk Text for RAG\" width=\"853\" height=\"460\" \/><br \/>Watch this video on YouTube<\/a><\/noscript><meta itemprop=\"description\" content=\"To try everything Brilliant has to offer\u2014free\u2014for a full 30 days, visit https:\/\/brilliant.org\/AdamLucek\/ You\u2019ll also get 20% off an annual premium subscription! Resources: Chunking Notebook: https:\/\/github.com\/ALucek\/chunking-strategies ChromaDB Technical Report: https:\/\/research.trychroma.com\/evaluating-chunking ChromaDB Report Repo: https:\/\/github.com\/brandonstarxel\/chunking_evaluation OpenAI Token Visualizer: https:\/\/platform.openai.com\/tokenizer Greg Kamradt 5 Levels of Text Splitting: https:\/\/github.com\/FullStackRetrieval-com\/RetrievalTutorials\/blob\/main\/tutorials\/LevelsOfTextSplitting\/5_Levels_Of_Text_Splitting.ipynb Jaccard Index: https:\/\/en.wikipedia.org\/wiki\/Jaccard_index Chapters: 00:00 - Background on Text Chunking 02:28 - Brilliant! 03:47 - Character Text Splitting 06:28 - Token Text Splitting 10:26 - Recursive Character\/Token Splitting 16:07 - Kamradt &amp; Modified Semantic Chunking 20:43 - Cluster Semantic Chunking 24:46 - LLM Semantic Chunking 27:56 - Chunking Metrics &amp; Comparison 30:00 - Overall Findings #ai #programming #datascience This video is sponsored by Brilliant\"><\/div><\/div><div class=\"lL\" style=\"max-width:100%;width:853px;margin:5px auto;\"><\/div><figcaption><\/figcaption><\/figure>\n\n<div class=\"vlp-link-container vlp-layout-spotlight-clone wp-block-visual-link-preview-link\"><a href=\"https:\/\/research.trychroma.com\/evaluating-chunking\" class=\"vlp-link\" title=\"Chroma \u7814\u7a76\u6280\u8853\u5831\u544a\" rel=\"nofollow\" target=\"_blank\"><\/a><div class=\"vlp-layout-zone-main\"><span class=\"vlp-block-0 vlp-link-title\">Chroma \u7814\u7a76\u6280\u8853\u5831\u544a<\/span><div class=\"vlp-block-1 vlp-link-summary\">\u7814\u7a76\u5718\u968a\u63d0\u51fa\u4e86\u4e00\u5957\u65b0\u7684\u8a55\u4f30\u65b9\u6cd5\uff0c\u4ee5\u8a5e\u5f59\u5c64\u7d1a\u7684\u76f8\u95dc\u6027\u4f5c\u70ba\u8a55\u4f30\u6a19\u6e96\uff0c\u4e26\u6bd4\u8f03\u4e86\u5e7e\u7a2e\u5e38\u898b\u7684\u5206\u5272\u7b56\u7565\uff0c\u5305\u542b RecursiveCharacterTextSplitter \u4ee5\u53ca\u5718\u968a\u63d0\u51fa\u7684\u5169\u7a2e\u65b0\u65b9\u6cd5\uff1aClusterSemanticChunker \u548c LLMSemanticChunker\u3002\u5be6\u9a57\u7d50\u679c\u986f\u793a\uff0c\u4e0d\u540c\u7684\u5206\u5272\u7b56\u7565\u5c0d\u6aa2\u7d22\u6e96\u78ba\u7387\u548c\u6548\u7387\u6709\u986f\u8457\u5f71\u97ff\uff0c\u67d0\u4e9b\u7b56\u7565\u7684\u53ec\u56de\u7387\u751a\u81f3\u63d0\u5347\u4e86 9%\u3002<\/div><div class=\"vlp-block-2 vlp-link-image\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"493\" src=\"https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-1024x493.png\" class=\"attachment-large size-large not-transparent\" alt=\"\" srcset=\"https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-1024x493.png 1024w, https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-300x144.png 300w, https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-768x370.png 768w, https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-1536x739.png 1536w, https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-2048x986.png 2048w, https:\/\/infernews.com\/blog\/wp-content\/uploads\/2024\/12\/card-640x308.png 640w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" data-has-transparency=\"false\" data-dominant-color=\"afbae4\" style=\"max-width: 1024px;--dominant-color: #afbae4;\" \/><\/div><\/div><\/div>","protected":false},"excerpt":{"rendered":"<p>\u6587\u4ef6\u5206\u5272\u7b56\u7565\u5c0d\u65bc\u5927\u578b\u8a9e\u8a00\u6a21\u578b(LLM)\u7684\u8cc7\u8a0a\u6aa2\u7d22\u6548\u80fd\u5f71\u97ff\u3002\u73fe\u6709\u8cc7\u8a0a\u6aa2\u7d22\u57fa\u6e96\u6e2c\u8a66\u901a\u5e38\u4ee5\u6574\u7bc7\u6587\u4ef6\u7684\u76f8\u95dc\u6027\u8a55\u4f30\u6548\u80fd\uff0c\u5ffd\u7565\u4e86\u6587\u4ef6\u5206\u5272\u7b56\u7565\u7684\u91cd\u8981\u6027\u3002RAG \u6587\u4ef6\u5206\u5272(Text Chunking) \u57fa\u65bc\u5b57\u5143\/\u8a5e(Token) \u7684\u5206\u5272\u3001\u905e\u8ff4\u5f0f\u5206\u5272\u3001\u8a9e\u7fa9\u5206\u5272\u7b49\uff0c\u4ee5\u627e\u51fa\u6700\u9069\u5408\u7528\u65bc\u5f8c\u7e8c\u57fa\u65bc\u6aa2\u7d22\u589e\u5f37\u751f\u6210\uff08RAG\uff09\u61c9\u7528\u7684\u6700\u4f73\u65b9\u6cd5\u3002<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"googlesitekit_rrm_CAowvqSiDA:productID":"","footnotes":""},"categories":[131,109,27],"tags":[],"class_list":["post-3786","post","type-post","status-publish","format-standard","hentry","category-embedding","category-rag","category-paper"],"_links":{"self":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/posts\/3786","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/comments?post=3786"}],"version-history":[{"count":0,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/posts\/3786\/revisions"}],"wp:attachment":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/media?parent=3786"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/categories?post=3786"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/tags?post=3786"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}