{"id":5718,"date":"2025-07-06T17:22:07","date_gmt":"2025-07-06T09:22:07","guid":{"rendered":"http:\/\/xinblog.ltd\/?p=5718"},"modified":"2025-07-07T09:20:26","modified_gmt":"2025-07-07T01:20:26","slug":"%e6%a8%a1%e5%9e%8b%e5%be%ae%e8%b0%833-huggingface-%e5%be%ae%e8%b0%83","status":"publish","type":"post","link":"http:\/\/xinblog.ltd\/?p=5718","title":{"rendered":"\u6a21\u578b\u5fae\u8c033 Huggingface \u5fae\u8c03"},"content":{"rendered":"<p>\u6a21\u578b\u5fae\u8c033 \u4f7f\u7528Huggingface Transformers\u6765\u5fae\u8c03\u6a21\u578b<\/p>\n<p>\u5982\u679c\u6211\u4eec\u5e0c\u671b\u5fae\u8c03\u89d2\u8272\u626e\u6f14\u6a21\u578b\u7684\u8bdd\uff0c\u53ef\u4ee5\u8003\u8651\u4f7f\u7528Huggingface Transformers\u6765\u8fdb\u884c\u5fae\u8c03\u3002<\/p>\n<p>\u672c\u6b21\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528Huggingface Transformers \u4ee5\u53caMeta-Llama-3-8B-Instruct\u6a21\u578b\u8fdb\u884c\u5fae\u8c03\u3002<\/p>\n<p>\u90a3\u4e48\u4e3a\u4e86\u7b26\u5408\u6a21\u578b\u5fae\u8c03\u7684\u73af\u5883\uff0c\u6211\u4eec\u9996\u5148\u9700\u8981\u5b89\u88c5\u76f8\u5173\u4f9d\u8d56<\/p>\n<table>\n<tbody>\n<tr>\n<td>!pip install transformers==4.37.2<\/p>\n<p>!pip install pandas==1.5.3<\/p>\n<p>!pip install matplotlib==3.8.0<\/p>\n<p>!pip install numpy==1.26.4<\/p>\n<p>!pip install datasets==2.18.0<\/p>\n<p>!pip install jieba==0.42.1<\/p>\n<p>!pip install rouge-chinese==1.0.3<\/p>\n<p>!pip install 
tqdm==4.66.1<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u4e4b\u540e\u6211\u4eec\u5fae\u8c03\u4f1a\u4f7f\u7528SFT\u8fdb\u884c\u5927\u6a21\u578b\u7684\u5fae\u8c03\uff0cSFT\u4f1a\u4f7f\u5f97\u5927\u6a21\u578b\u66f4\u52a0\u9075\u5faa\u6307\u4ee4\uff0c\u98ce\u683c\u3002<\/p>\n<p>\u518d\u5176\u6b21\u9700\u8981\u51c6\u5907\u6a21\u578b\u5fae\u8c03\u4f7f\u7528\u5230\u7684\u6570\u636e\u96c6<\/p>\n<p>\u4e00\u822c\u6765\u8bf4\u90fd\u662f\u7b26\u5408Prompt+Response\u7684\u6570\u636e\u683c\u5f0f<\/p>\n<p>\u8fd9\u6b21\u6211\u4eec\u76f4\u63a5\u4f7f\u7528CharacterEval\u5f00\u6e90\u6570\u636e\u96c6\u8fdb\u884c\u5b9e\u9a8c\u3002\u5176\u4e2d\u5305\u542b\u4e86\u8be6\u7ec6\u7684\u4eba\u8bbe\u4fe1\u606f\uff0c\u4e3e\u4e2a\u793a\u4f8b<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" width=\"2403\" height=\"191\" class=\"wp-image-5719\" src=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7.png\" srcset=\"http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7.png 2403w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7-300x24.png 300w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7-1024x81.png 1024w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7-768x61.png 768w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7-1536x122.png 1536w, http:\/\/xinblog.ltd\/wp-content\/uploads\/2025\/06\/word-image-7-2048x163.png 2048w\" sizes=\"(max-width: 2403px) 100vw, 2403px\" 
\/><\/p>\n<p>\u63a5\u4e0b\u6765\u6211\u4eec\u9700\u8981\u4f7f\u7528Transformers\u5bf9Meta-Llama-3\u8fdb\u884c\u5168\u91cf\u66f4\u65b0<\/p>\n<p>\u9996\u5148\u5047\u8bbe\u6211\u4eec\u6709\u4e00\u4e2a\u6570\u636e\u96c6\uff0c\u5176\u4e2d\u542b\u67091000\u4e2a\u4e0a\u8ff0\u7684\u5bf9\u8bdd\uff0c\u5305\u542b\u4e86\u4e0d\u540c\u7684\u89d2\u8272\u548c\u5176\u4e2d\u7684\u5bf9\u8bdd\u3002<\/p>\n<p>\u9996\u5148\u6211\u4eec\u9700\u8981\u8fdb\u884c\u5212\u5206\uff0c\u5c06\u5176\u5212\u5206\u4e3a\u8bad\u7ec3\u96c6\u548c\u8bc4\u4f30\u96c6<\/p>\n<p>\u5176\u4e2d\u9075\u5faa\u4e09\u4e2a\u539f\u5219\uff0c<\/p>\n<p>1.\u9996\u5148\u662f\u8bc4\u4f30\u96c6\u7684\u6570\u636e\u548c\u8bad\u7ec3\u96c6\u7684\u6570\u636e\u6765\u6e90\u4e00\u81f4\uff0c\u5185\u5bb9\u4e0d\u80fd\u91cd\u590d<\/p>\n<p>2.\u5176\u6b21\u662f\u786e\u4fdd\u7ecf\u5e38\u4f7f\u7528\u7684\u4eba\u8bbe\u5bf9\u5e94\u7684\u8bad\u7ec3\u6570\u636e\u8981\u591a\uff0c\u65b9\u4fbf\u8bc4\u6d4b<\/p>\n<p>3.\u6700\u540e\u662f\u4fdd\u7559\u4e00\u4e9b\u6ca1\u6709\u5728\u6570\u636e\u4e2d\u6d89\u53ca\u7684\u4eba\u8bbe\uff0c\u7528\u6765\u8bc4\u6d4b\u5927\u6a21\u578b\u6cdb\u5316\u80fd\u529b\u3002<\/p>\n<p>\u56e0\u6b64\u6211\u4eec\u53ef\u4ee5\u5c06\u6570\u636e\u96c6\u6309\u7167\u4eba\u8bbe\u5206\u7ec4\uff0c<\/p>\n<p>\u7136\u540e\u53d610\u4e2a\u4eba\u8bbe\u5355\u72ec\u4f5c\u4e3a\u9a8c\u8bc1\u96c6\u4e2d\u7684\u6570\u636e\u786e\u4fdd\u6700\u540e\u4e00\u6761\u7b26\u5408<\/p>\n<p>\u9664\u6b64\u5916\uff0c\u5bf9\u4e8e\u5269\u4f59\u4e0b\u6765\u7684\u4eba\u8bbe\uff0c\u53d610%\u7684\u6570\u636e\u653e\u5165\u6d4b\u8bd5\u96c6\uff0c\u5269\u4e0b\u768490%\u52a0\u5165\u6570\u636e\u96c6<\/p>\n<table>\n<tbody>\n<tr>\n<td>import random<\/p>\n<p>import copy<\/p>\n<p>import math<\/p>\n<p>random.seed(42)<\/p>\n<p>all_data = copy.deepcopy(processed_data)<\/p>\n<p># \u6253\u4e71\u6570\u636e\u987a\u5e8f<\/p>\n<p>random.shuffle(all_data)<\/p>\n<p>all_data_df = pd.DataFrame(all_data)<\/p>\n<p>train_df_list = []<\/p>\n<p>eval_df_list = []<\/p>\n<p>role_name_list = 
list(role_to_id.keys())<\/p>\n<p># \u53d6\u6700\u540e10\u4e2a\u89d2\u8272\u5168\u90e8\u653e\u7f6e\u5728\u9a8c\u8bc1\u96c6\u4e2d<\/p>\n<p>for role_name in role_name_list[-10:]:<\/p>\n<p>role_df = all_data_df[all_data_df[&#8216;role&#8217;] == role_name]<\/p>\n<p>eval_df_list.append(role_df)<\/p>\n<p>for role_name in role_name_list[:-10]:<\/p>\n<p>role_df = all_data_df[all_data_df[&#8216;role&#8217;] == role_name]<\/p>\n<p># \u53d690%\u7684\u6570\u636e\u4f5c\u4e3a\u8bad\u7ec3\u96c6<\/p>\n<p>train_role_num = math.ceil(len(role_df) * 0.9)<\/p>\n<p>eval_role_num = len(role_df) &#8211; train_role_num<\/p>\n<p>train_role_df = role_df.iloc[:train_role_num]<\/p>\n<p>train_df_list.append(train_role_df)<\/p>\n<p>eval_role_df = role_df.iloc[train_role_num:]<\/p>\n<p>eval_df_list.append(eval_role_df)<\/p>\n<p>train_df = pd.concat(train_df_list)<\/p>\n<p>eval_df = pd.concat(eval_df_list)<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u4e4b\u540e\u5c31\u662f\u8bad\u7ec3\u64cd\u4f5c\u4e86<\/p>\n<p>\u5176\u4e2d\u6838\u5fc3\u662f\u8bad\u7ec3\u7684\u51fd\u6570<\/p>\n<table>\n<tbody>\n<tr>\n<td>def train(<\/p>\n<p># model\/data params<\/p>\n<p>base_model: str = &#8220;.\/model\/Meta-Llama-3-8B-Instruct&#8221;,<\/p>\n<p>max_seq_len: int = 8192,<\/p>\n<p>train_data_path: str = &#8220;.\/data\/train_ds.jsonl&#8221;,<\/p>\n<p>val_data_path: str = &#8220;.\/data\/eval_ds.jsonl&#8221;,<\/p>\n<p>output_dir: str = &#8220;.\/output\/role_model\/&#8221;,<\/p>\n<p>micro_batch_size: int = 8,<\/p>\n<p>gradient_accumulation_steps: int = 1,<\/p>\n<p>num_epochs: int = 3,<\/p>\n<p>learning_rate: float = 3e-5,<\/p>\n<p>val_set_size: int = 10,<\/p>\n<p>):<\/p>\n<p>device_map = &#8220;auto&#8221;<\/p>\n<p># \u52a0\u8f7d\u6a21\u578b<\/p>\n<p>model = AutoModelForCausalLM.from_pretrained(<\/p>\n<p>base_model,<\/p>\n<p>device_map=device_map,<\/p>\n<p>attn_implementation=&#8221;flash_attention_2&#8243;<\/p>\n<p>)<\/p>\n<p>tokenizer = 
AutoTokenizer.from_pretrained(<\/p>\n<p>base_model<\/p>\n<p>)<\/p>\n<p>tokenizer.pad_token_id = 0<\/p>\n<p>if train_data_path.endswith(&#8220;.json&#8221;) or train_data_path.endswith(&#8220;.jsonl&#8221;):<\/p>\n<p>train_ds = load_dataset(&#8220;json&#8221;, data_files=train_data_path)<\/p>\n<p>else:<\/p>\n<p>train_ds = load_dataset(train_data_path)<\/p>\n<p>if val_data_path is not None:<\/p>\n<p>if val_data_path.endswith(&#8220;.json&#8221;) or val_data_path.endswith(&#8220;.jsonl&#8221;):<\/p>\n<p>val_ds = load_dataset(&#8220;json&#8221;, data_files=val_data_path)<\/p>\n<p>else:<\/p>\n<p>val_ds = load_dataset(val_data_path)<\/p>\n<p>train_ds = train_ds[&#8220;train&#8221;]<\/p>\n<p>val_ds = val_ds[&#8220;train&#8221;]<\/p>\n<p>else:<\/p>\n<p>#split the data to train\/val set<\/p>\n<p>train_val = train_ds[&#8220;train&#8221;].train_test_split(<\/p>\n<p>test_size=val_set_size, shuffle=False, seed=42<\/p>\n<p>)<\/p>\n<p>train_ds = train_val[&#8220;train&#8221;]<\/p>\n<p>val_ds = train_val[&#8220;test&#8221;]<\/p>\n<p>train_data = train_ds.map(preprocess,<\/p>\n<p>fn_kwargs={&#8216;tokenizer&#8217;: tokenizer, &#8216;max_length&#8217;: max_seq_len,<\/p>\n<p>&#8216;template&#8217;: LLAMA3_TEMPLATE},<\/p>\n<p>batched=True, num_proc=16,<\/p>\n<p>remove_columns=[&#8220;src&#8221;, &#8220;tgt&#8221;]<\/p>\n<p>)<\/p>\n<p>val_data = val_ds.map(preprocess,<\/p>\n<p>fn_kwargs={&#8216;tokenizer&#8217;: tokenizer, &#8216;max_length&#8217;: max_seq_len,<\/p>\n<p>&#8216;template&#8217;: LLAMA3_TEMPLATE},<\/p>\n<p>batched=True, num_proc=16,<\/p>\n<p>remove_columns=[&#8220;src&#8221;, &#8220;tgt&#8221;]<\/p>\n<p>)<\/p>\n<p>trainer = 
transformers.Trainer(<\/p>\n<p>model=model,<\/p>\n<p>train_dataset=train_data,<\/p>\n<p>eval_dataset=val_data,<\/p>\n<p>args=transformers.TrainingArguments(<\/p>\n<p>per_device_train_batch_size=micro_batch_size,<\/p>\n<p>gradient_accumulation_steps=gradient_accumulation_steps,<\/p>\n<p>warmup_steps=50,<\/p>\n<p>num_train_epochs=num_epochs,<\/p>\n<p>learning_rate=learning_rate,<\/p>\n<p>logging_steps=10,<\/p>\n<p>optim=&#8221;adamw_torch&#8221;,<\/p>\n<p>fp16=True,<\/p>\n<p>evaluation_strategy=&#8221;steps&#8221;,<\/p>\n<p>save_strategy=&#8221;steps&#8221;,<\/p>\n<p>eval_steps=500,<\/p>\n<p>save_steps=1000,<\/p>\n<p>output_dir=output_dir,<\/p>\n<p>save_total_limit=3<\/p>\n<p>),<\/p>\n<p>data_collator=transformers.DataCollatorForSeq2Seq(<\/p>\n<p>tokenizer, pad_to_multiple_of=8, return_tensors=&#8221;pt&#8221;, padding=True<\/p>\n<p>),<\/p>\n<p>)<\/p>\n<p>trainer.train()<\/p>\n<p>model.save_pretrained(output_dir, max_shard_size=&#8221;2GB&#8221;) # \u4fdd\u5b58\u6a21\u578b\uff0c\u4e14\u6700\u5927\u5206\u7247\u5927\u5c0f\u4e3a 2GB<\/p>\n<p>tokenizer.save_pretrained(output_dir) # \u4fdd\u5b58 tokenizer<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u9664\u6b64\u5916\u5c31\u662f\u6570\u636e\u9884\u5904\u7406\uff0c\u9700\u8981\u5904\u7406\u7684\u548c\u6a21\u578b\u4e4b\u524d\u8bad\u7ec3\u65f6\u4f7f\u7528\u7684\u4e00\u81f4<\/p>\n<table>\n<tbody>\n<tr>\n<td>from typing import List<\/p>\n<p>import copy<\/p>\n<p>import torch<\/p>\n<p>import transformers<\/p>\n<p>from datasets import load_dataset<\/p>\n<p>from transformers import AutoTokenizer, AutoModelForCausalLM<\/p>\n<p>LLAMA3_TEMPLATE = &#8220;&lt;|start_header_id|&gt;user&lt;|end_header_id|&gt;\\n\\n{}&lt;|eot_id|&gt;&lt;|start_header_id|&gt;assistant&lt;|end_header_id|&gt;\\n\\n&#8221;<\/p>\n<p>def preprocess(example, tokenizer, max_length, template=LLAMA3_TEMPLATE):<\/p>\n<p>&#8220;&#8221;&#8221;<\/p>\n<p>Preprocess a dialogue dataset into the format required by model.<\/p>\n<p>&#8220;&#8221;&#8221;<\/p>\n<p>srcs = 
example[&#8216;src&#8217;]<\/p>\n<p>tgts = example[&#8216;tgt&#8217;]<\/p>\n<p>result_inputs = []<\/p>\n<p>result_labels = []<\/p>\n<p>result_attention_mask = []<\/p>\n<p>for idx, (questions, answers) in enumerate(zip(srcs, tgts)):<\/p>\n<p>utterances = []<\/p>\n<p>for i, (src, tgt) in enumerate(zip(questions, answers)):<\/p>\n<p>if i == 0:<\/p>\n<p>utterances.append(&#8216;&lt;|begin_of_text|&gt;&#8217; + template.format(src))<\/p>\n<p>else:<\/p>\n<p>utterances.append(template.format(src))<\/p>\n<p>utterances.append(tgt)<\/p>\n<p>utterances_ids = tokenizer(utterances,<\/p>\n<p>add_special_tokens=False,<\/p>\n<p>max_length=max_length,<\/p>\n<p>truncation=True).input_ids<\/p>\n<p>input_ids = []<\/p>\n<p>label = [] # \u7528\u4e8e\u5bf9input\u8fdb\u884cmask\uff0c\u53ea\u8ba1\u7b97target\u90e8\u5206\u7684loss<\/p>\n<p>for i, utterances_id in enumerate(utterances_ids):<\/p>\n<p>if i % 2 == 0:<\/p>\n<p># instruction<\/p>\n<p>input_ids += utterances_id<\/p>\n<p># \u5bf9instruction\u5bf9\u5e94\u7684label\u8fdb\u884cmask<\/p>\n<p>label += ([-100] * len(utterances_id))<\/p>\n<p>else:<\/p>\n<p># response<\/p>\n<p>input_ids += (utterances_id + [tokenizer.convert_tokens_to_ids(&#8220;&lt;|eot_id|&gt;&#8221;)])<\/p>\n<p>label += (utterances_id + [tokenizer.convert_tokens_to_ids(&#8220;&lt;|eot_id|&gt;&#8221;)])<\/p>\n<p>assert len(input_ids) == len(label)<\/p>\n<p># \u5bf9\u957f\u5ea6\u8fdb\u884c\u622a\u65ad<\/p>\n<p>input_ids = input_ids[:max_length]<\/p>\n<p>label = label[:max_length]<\/p>\n<p>attention_mask = [1] * len(input_ids)<\/p>\n<p>result_inputs.append(input_ids)<\/p>\n<p>result_labels.append(label)<\/p>\n<p>result_attention_mask.append(attention_mask)<\/p>\n<p>return dict(input_ids=result_inputs, labels=result_labels, attention_mask=result_attention_mask)<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u6700\u540e\u8c03\u7528train\u51fd\u6570<\/p>\n<table>\n<tbody>\n<tr>\n<td>train(<\/p>\n<p># model\/data params<\/p>\n<p>base_model = 
&#8220;\/mnt\/cfs_bj\/big_model\/models\/meta-llama\/Meta-Llama-3-8B-Instruct\/&#8221;,<\/p>\n<p>train_data_path= &#8220;.\/data\/train_ds.jsonl&#8221;,<\/p>\n<p>val_data_path = &#8220;.\/data\/eval_ds.jsonl&#8221;,<\/p>\n<p>output_dir = &#8220;.\/output\/role_model\/&#8221;,<\/p>\n<p>micro_batch_size = 2,<\/p>\n<p>gradient_accumulation_steps = 1,<\/p>\n<p>num_epochs = 3,<\/p>\n<p>learning_rate = 3e-5<\/p>\n<p>)<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u8bad\u7ec3\u4e4b\u540e\u7684\u6a21\u578b\u9700\u8981\u8fdb\u884c\u4eba\u5de5\u8bc4\u6d4b\u548c\u81ea\u52a8\u5316\u8bc4\u6d4b\u4e24\u4e2a\u90e8\u5206\u3002<\/p>\n<p>\u5176\u4e2d\u4eba\u5de5\u8bc4\u6d4b\u7684\u90e8\u5206\uff0c\u91c7\u7528\u6253\u5206\u5236<\/p>\n<p>\u4ee54-0\u4e3a\u4e0d\u540c\u7684\u56de\u7b54\u6253\u5206\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u6a21\u578b\u5fae\u8c033 \u4f7f\u7528Hugging [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[49],"tags":[],"_links":{"self":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts\/5718"}],"collection":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=5718"}],"version-history":[{"count":1,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts\/5718\/revisions"}],"predecessor-version":[{"id":5720,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=\/wp\/v2\/posts\/5718\/revisions\/5720"}],"wp:attachment":[{"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=5718"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/x
inblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=5718"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/xinblog.ltd\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=5718"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}