His reply:
The configuration file is:

seed: 0
output_dir: './output'  # path to save checkpoint/strategy
load_checkpoint: '/home/ma-user/work/workspace/llama2_7b.ckpt'
src_strategy_path_or_dir: ''
auto_trans_ckpt: False  # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False
run_mode: 'predict'

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'llama2_7b'

# runner config
runner_config:
  epochs: 2
  batch_size: 1
  sink_mode: True
  sink_size: 2
  gradient_accumulation_steps: 8

# optimizer
optimizer:
  type: FP32StateAdamWeightDecay
  beta1: 0.9
  beta2: 0.95
  eps: 1.e-8
  learning_rate: 5.e-5

# lr schedule
lr_schedule:
  type: CosineWithWarmUpLR
  learning_rate: 5.e-5
  lr_end: 0
  warmup_ratio: 0.03
  total_steps: -1  # -1 means it will load the total steps of the dataset

# dataset
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: ["input_ids"]  # "input_ids", "labels"; labels are used in instruction finetune.
  num_parallel_workers: 1
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 1
  repeat: 1
  numa_enable: False
  prefetch_size: 1
train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset
# if True, do evaluate during the training process. if false, do nothing.
# note that the task trainer should support _evaluate_in_training function.
do_eval: False

# eval dataset
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: False
  input_columns: ["input_ids"]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: False
  repeat: 1
  numa_enable: False
  prefetch_size: 1
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

use_parallel: False
# parallel context config
parallel:
  parallel_mode: 1  # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
  gradients_mean: False
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: False
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
  parallel_optimizer_config:
    gradient_accumulation_shard: False
    parallel_optimizer_threshold: 64

# default parallel of device num = 8 for Atlas 800T A2
parallel_config:
  data_parallel: 1
  model_parallel: 1
  pipeline_stage: 1
  use_seq_parallel: False
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 4
# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, which may accelerate the train process.
micro_batch_interleave_num: 1

# recompute config
recompute_config:
  recompute: False
  select_recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: CheckpointMonitor
    prefix: "llama2_7b"
    save_checkpoint_steps: 100
    integrated_save: False
    async_save: False
  - type: ObsMonitor

# mindspore context init config
context:
  mode: 0  # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  max_call_depth: 10000
  max_device_memory: "28GB"
  save_graphs: False
  save_graphs_path: "./graph"
  device_id: 0

# model config
model:
  model_config:
    type: LlamaConfig
    batch_size: 1  # add for increase predict
    seq_length: 4096
    hidden_size: 4096
    num_layers: 32
    num_heads: 32
    vocab_size: 32000
    multiple_of: 256
    rms_norm_eps: 1.0e-5
    bos_token_id: 1
    eos_token_id: 2
    pad_token_id: 0
    ignore_token_id: -100
    compute_dtype: "float16"
    layernorm_compute_type: "float32"
    softmax_compute_type: "float32"
    rotary_dtype: "float16"
    param_init_type: "float16"
    use_past: True
    scaling_factor: 1.0  # The scale factor of seq length
    extend_method: "None"  # support "None", "PI", "NTK"
    use_flash_attention: True  # FA can accelerate training or finetune
    block_size: 16
    num_blocks: 1024
    is_dynamic: True
    qkv_concat: False
    offset: 0
    checkpoint_name_or_path: "llama2_7b"
    repetition_penalty: 1
    max_decode_length: 512
    top_k: 3
    top_p: 1
    do_sample: False
  arch:
    type: LlamaForCausalLM

processor:
  return_tensors: ms
  tokenizer:
    unk_token: '<unk>'
    bos_token: '<s>'
    eos_token: '</s>'
    pad_token: '<unk>'
    type: LlamaTokenizer
  type: LlamaProcessor

# metric
metric:
  type: EmF1Metric

# wrapper cell config
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 65536
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

eval_callbacks:
  - type: ObsMonitor

auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10

profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: False
profile_communication: False
profile_memory: True
layer_scale: False
layer_decay: 0.65
lr_scale_factor: 256

# aicc
remote_save_url: "Please input obs url on AICC platform."
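For context, here is a minimal Python sketch of how a predict config like this is typically driven through the MindFormers API. The file name predict_llama2_7b.yaml, the vocab path, the prompt, and the generation arguments are illustrative assumptions, not taken from the reply, and exact import paths can differ between MindFormers versions.

# Minimal inference sketch for the config above; paths and prompt are hypothetical.
from mindformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
from mindformers.tools.register import MindFormerConfig

# Parse the YAML shown in the reply and build the model config from its model section.
config = MindFormerConfig("predict_llama2_7b.yaml")       # hypothetical file name
model_config = LlamaConfig(**config.model.model_config)   # picks up use_past, is_dynamic, FA, etc.

tokenizer = LlamaTokenizer(vocab_file="tokenizer.model")  # hypothetical vocab path
model = LlamaForCausalLM(model_config)                    # weights resolved via checkpoint_name_or_path

# Greedy decoding, consistent with do_sample: False in the config.
input_ids = tokenizer("I love Beijing, because")["input_ids"]
output_ids = model.generate(input_ids, max_length=64)
print(tokenizer.decode(output_ids[0]))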