{"id":3323,"date":"2024-09-04T06:56:22","date_gmt":"2024-09-03T22:56:22","guid":{"rendered":"https:\/\/infernews.com\/?page_id=3323"},"modified":"2024-09-04T06:56:22","modified_gmt":"2024-09-03T22:56:22","slug":"%e5%be%ae%e8%aa%bf-llama-3-%e7%9a%84%e4%b8%ad%e6%96%87-text-embedding","status":"publish","type":"page","link":"https:\/\/infernews.com\/blog\/%e5%be%ae%e8%aa%bf-llama-3-%e7%9a%84%e4%b8%ad%e6%96%87-text-embedding\/","title":{"rendered":"\u5fae\u8abf Llama 3 \u7684\u4e2d\u6587 Text Embedding"},"content":{"rendered":"\n<p>Llama \u6a21\u578b\u53ef\u4ee5\u7528\u65bc Text Embedding\uff0c\u5fae\u8abf\u5b83\u4f86\u9069\u7528\u65bc\u4e0d\u540c\u7684\u4efb\u52d9\u662f\u4e00\u500b\u5e38\u898b\u7684 approach\u3002<\/p>\n\n\n\n<p>\u4e0b\u9762\u662f\u4e00\u500b\u57fa\u672c\u793a\u4f8bPython\u8173\u672c\uff0c\u53ef\u4ee5\u4f7f\u7528 pytorch \u548c transformers \u5eab\u4f86\u5fae\u8abf\u4e00\u500b\u5df2\u7d93\u9810\u8a13\u7df4\u7684\u4e2d\u6587\u8a9e\u7fa9\u5d4c\u5165\u6a21\u578b\uff08\u6bd4\u5982BERT\u6216ALBERT\uff09\uff0c\u5c07\u5176\u61c9\u7528\u65bc\u7279\u5b9a\u7684 Text Embedding \u4efb\u52d9\u4e2d\uff1a<\/p>\n\n\n\n<p>import pandas as pd<\/p>\n\n\n\n<p>from sklearn.metrics.pairwise import cosine_similarity &nbsp;<\/p>\n\n\n\n<p>from torch.utils.data import Dataset, DataLoader &nbsp;<\/p>\n\n\n\n<p>from transformers import BertTokenizerFast, AutoConfig &nbsp;<\/p>\n\n\n\n<p><em># <\/em>\u8a2d\u7f6e\u53c3\u6578\u548c\u8d85\u53c3\u6578<\/p>\n\n\n\n<p>epochs &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = 5&nbsp; &nbsp;<\/p>\n\n\n\n<p>batch_size &nbsp; &nbsp; &nbsp; = 64&nbsp; &nbsp;<\/p>\n\n\n\n<p>device &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = &#8216;gpu&#8217; if torch.cuda.is_available() else &#8216;cpu&#8217;<\/p>\n\n\n\n<p><em># <\/em>\u52a0\u8f09\u9810\u8a13\u7df4\u6a21\u578b\uff0c\u4e26\u8a2d\u7f6e\u5fae\u8abf\u8d85\u53c3\u6578<\/p>\n\n\n\n<p>tokenizer&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = BertTokenizerFast.from_pretrained(&#8220;huawei-noah-aliexpress\/bert-base-chinese&#8221;) &nbsp;<\/p>\n\n\n\n<p>config&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = AutoConfig.from_pretrained(&#8220;huawei-noah-aliexpress\/bert-base-chinese&#8221;)<\/p>\n\n\n\n<p>model &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = BertModel(config)<\/p>\n\n\n\n<p>optimizer &nbsp; &nbsp; &nbsp; &nbsp; = Adam(model.parameters(), lr=1e-5)<\/p>\n\n\n\n<p><em># <\/em>\u5275\u5efa\u81ea\u5b9a\u7fa9 <em>Dataset<\/em> \u985e\uff0c\u7528\u4f86\u52a0\u8f09\u3001\u9810\u8655\u7406\u8a13\u7df4\u6587\u672c\u6578\u64da<\/p>\n\n\n\n<p>class <strong>TextData<\/strong>(Dataset):<\/p>\n\n\n\n<p>&nbsp; &nbsp; def <strong>__init__<\/strong>(self, data_path, tokenizer, max_length):<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; self.data_path&nbsp; &nbsp; &nbsp; = data_path &nbsp;<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; self.tokenizer &nbsp; &nbsp; = tokenizer<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; self.max_length&nbsp; &nbsp; = max_length<\/p>\n\n\n\n<p>&nbsp; &nbsp; def <strong>process<\/strong>(self):<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; <em># <\/em>\u52a0\u8f09\u8a13\u7df4\u6587\u672c\u3001\u6a19\u7c64\u6578\u64da\uff0c\u4e26\u4f7f\u7528<em>tokenizer<\/em>\u5c07\u5176\u8f49\u63db\u70ba<em>tokenids<\/em>\u548c<em>attention_mask<\/em>\u5f62\u5f0f\u3002<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; data&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = pd.read_csv(self.data_path)<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; tokens&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = self.tokenizer(data[&#8216;text&#8217;], return_tensors=&#8217;pt&#8217;, padding=True,<\/p>\n\n\n\n<p>&nbsp;&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; truncation=True,max_length=self.max_length)<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; return tokens &nbsp;<\/p>\n\n\n\n<p>data_set &nbsp; &nbsp; &nbsp; = TextData(&#8220;your_data.csv&#8221;, tokenizer, max_length=128)<\/p>\n\n\n\n<p>train_loader &nbsp; = DataLoader(dataset=data_set,batch_size=batch_size,collate_fn=lambda batch:[sample tensors])<\/p>\n\n\n\n<p><em># <\/em>\u5b9a\u7fa9<em>training loop<\/em><\/p>\n\n\n\n<p>for epoch in range(epochs):<\/p>\n\n\n\n<p>&nbsp; &nbsp; model.train()<\/p>\n\n\n\n<p>&nbsp; &nbsp; total_loss = 0 &nbsp;<\/p>\n\n\n\n<p>&nbsp; &nbsp; count = 0<\/p>\n\n\n\n<p>&nbsp; &nbsp; for batch in train_loader:<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; input_ids&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = batch[&#8216;input_ids&#8217;].to(device)<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; attention_mask &nbsp; &nbsp; = batch[&#8216;attention_mask&#8217;].to(device)<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; optimizer.zero_grad() &nbsp;<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; outputs &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = model(input_ids, attention_mask=attention_mask) &nbsp;<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; loss&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = outputs.loss<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; total_loss&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; += loss.cpu().item()<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; count &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; += len(loss)<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; loss.backward()<\/p>\n\n\n\n<p>&nbsp; &nbsp; &nbsp; &nbsp; optimizer.step()<\/p>\n\n\n\n<p>&nbsp; &nbsp; train_accuracy &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = torch.sum(torch.round(torch.sigmoid(outputs)).cpu())\/outputs.shape[0]<\/p>\n\n\n\n<p>&nbsp; &nbsp; print(f&#8217;Epoch {epoch}, Training Loss: {total_loss\/count:.4f}&#8217;)<\/p>\n\n\n\n<p>model.eval()<\/p>\n\n\n\n<p><em># <\/em>\u4f7f\u7528\u5fae\u8abf\u6a21\u578b\u9032\u884c\u6587\u672c\u5d4c\u5165\u8a08\u7b97<\/p>\n\n\n\n<p>def <strong>get_embedding<\/strong>(text):<\/p>\n\n\n\n<p>&nbsp; &nbsp; tokens_id&nbsp; &nbsp; &nbsp; &nbsp; = tokenizer.encode(text,max_length=max_length,truncation=True,padding=&#8217;max_length&#8217;)<\/p>\n\n\n\n<p>&nbsp; &nbsp; tokens&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = torch.tensor([tokens_id]).to(device)<\/p>\n\n\n\n<p>&nbsp; &nbsp; embeddings &nbsp; &nbsp; &nbsp; &nbsp; = model(tokens)[0].cpu().detach().numpy()<\/p>\n\n\n\n<p>&nbsp; &nbsp; return embeddings<\/p>\n\n\n\n<p>\u4f60\u53ef\u4ee5\u6839\u64da\u5be6\u969b\u9700\u6c42\u9032\u884c\u76f8\u61c9\u7684\u4fee\u6539\u548c\u6dfb\u52a0\u3002\u8acb\u6ce8\u610f\uff0c\u8a72\u8173\u672c\u50c5\u63d0\u4f9b\u4e00\u500b\u57fa\u672c\u793a\u4f8b\uff0c\u60a8\u53ef\u80fd\u9700\u8981\u5728\u904b\u884c\u524d\u8abf\u6574\u8d85\u53c3\u6578\u3001\u6578\u64da\u52a0\u8f09\u7b49\u7d30\u7bc0\uff0c\u4ee5\u78ba\u4fdd\u5176\u80fd\u5920\u6eff\u8db3\u60a8\u7684\u7279\u5b9a\u8981\u6c42\u3002<\/p>\n\n\n\n<p>\u540c\u6642\uff0c\u5fae\u8abf\u6a21\u578b\u5c0d\u65bc\u6587\u672c\u5d4c\u5165\u4efb\u52d9\u4f86\u8aaa\u662f\u6709\u6548\u7684\u65b9\u6cd5\uff0c\u4f46\u60a8\u4e5f\u53ef\u4ee5\u5617\u8a66\u5176\u4ed6\u65b9\u6cd5\u548c\u6280\u8853\u4f86\u89e3\u6c7a\u9019\u4e00\u554f\u984c\uff0c\u4f8b\u5982\u4f7f\u7528\u9810\u8a13\u7df4\u6a21\u578b\u7684\u8f38\u51fa\u4f5c\u70ba\u6587\u672c\u5411\u91cf\u8868\u793a\u76f4\u63a5\u8f38\u5165\u5230\u4e0b\u6e38\u6a21\u578b\u3002\u6700\u7d42\u9078\u64c7\u54ea\u7a2e\u65b9\u6cd5\u53d6\u6c7a\u65bc\u60a8\u7684\u5177\u9ad4\u61c9\u7528\u5834\u666f\u53ca\u9700\u6c42\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Llama \u6a21\u578b\u53ef\u4ee5\u7528\u65bc Text Embedding\uff0c\u5fae\u8abf\u5b83\u4f86\u9069\u7528\u65bc\u4e0d\u540c\u7684\u4efb\u52d9\u662f\u4e00\u500b\u5e38\u898b\u7684 approach\u3002 \u4e0b\u9762\u662f\u4e00\u500b\u57fa\u672c\u793a\u4f8bPython\u8173\u672c\uff0c\u53ef\u4ee5\u4f7f\u7528 pytorch \u548c transformers \u5eab\u4f86\u5fae\u8abf\u4e00\u500b\u5df2\u7d93\u9810\u8a13\u7df4\u7684\u4e2d\u6587\u8a9e\u7fa9\u5d4c\u5165\u6a21\u578b\uff08\u6bd4\u5982BERT\u6216ALBERT\uff09\uff0c\u5c07\u5176\u61c9\u7528\u65bc\u7279\u5b9a\u7684 Text Embedding \u4efb\u52d9\u4e2d\uff1a import pandas as pd from sklearn.metrics.pairwise import cosine_similarity &nbsp; from torch.utils.data import Dataset, DataLoader &nbsp; from transformers import BertTokenizerFast, AutoConfig &nbsp; # \u8a2d\u7f6e\u53c3\u6578\u548c\u8d85\u53c3\u6578 epochs &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; = 5&nbsp; &nbsp; batch_size &nbsp; &nbsp; &nbsp; = 64&nbsp; &nbsp; device &nbsp; &nbsp; [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"parent":0,"menu_order":0,"comment_status":"closed","ping_status":"closed","template":"","meta":{"googlesitekit_rrm_CAowvqSiDA:productID":"","footnotes":""},"class_list":["post-3323","page","type-page","status-publish","hentry"],"_links":{"self":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages\/3323","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages"}],"about":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/types\/page"}],"author":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/comments?post=3323"}],"version-history":[{"count":0,"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/pages\/3323\/revisions"}],"wp:attachment":[{"href":"https:\/\/infernews.com\/blog\/wp-json\/wp\/v2\/media?parent=3323"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}