{
  "history": [
    {
      "loss": 7.450580596923828e-09,
      "grad_norm": 0.8439565300941467,
      "learning_rate": 2.5e-06,
      "num_tokens": 7984.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.5021741837263107,
      "rewards/grpo_reward_fn/std": 0.7405239939689636,
      "reward": -0.5021741837263107,
      "reward_std": 0.7405239939689636,
      "frac_reward_zero_std": 0.25,
      "entropy": 1.7798368036746979,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.351858943500133,
      "epoch": 0.05,
      "step": 2
    },
    {
      "loss": -1.1175870895385742e-08,
      "grad_norm": 0.664717435836792,
      "learning_rate": 7.500000000000001e-06,
      "num_tokens": 16448.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.48739828169345856,
      "rewards/grpo_reward_fn/std": 0.7173754274845123,
      "reward": -0.48739828169345856,
      "reward_std": 0.7173753678798676,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.654908001422882,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.101864065999848,
      "epoch": 0.1,
      "step": 4
    },
    {
      "loss": 1.1175870895385742e-08,
      "grad_norm": 0.5121914148330688,
      "learning_rate": 9.722222222222223e-06,
      "num_tokens": 24912.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.6889523267745972,
      "rewards/grpo_reward_fn/std": 0.6593092978000641,
      "reward": -0.6889523267745972,
      "reward_std": 0.6593093276023865,
      "frac_reward_zero_std": 0.5,
      "entropy": 2.1314686238765717,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.089144609999835,
      "epoch": 0.15,
      "step": 6
    },
    {
      "loss": 7.450580596923828e-09,
      "grad_norm": 0.5797783136367798,
      "learning_rate": 9.166666666666666e-06,
      "num_tokens": 33136.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.5775661319494247,
      "rewards/grpo_reward_fn/std": 0.6775152683258057,
      "reward": -0.5775661319494247,
      "reward_std": 0.6775152385234833,
      "frac_reward_zero_std": 0.25,
      "entropy": 2.303558796644211,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.272966438000026,
      "epoch": 0.2,
      "step": 8
    },
    {
      "loss": -1.192092824453539e-08,
      "grad_norm": 0.7534540295600891,
      "learning_rate": 8.611111111111112e-06,
      "num_tokens": 41360.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.7124589681625366,
      "rewards/grpo_reward_fn/std": 0.532426506280899,
      "reward": -0.7124589681625366,
      "reward_std": 0.5324264764785767,
      "frac_reward_zero_std": 0.0,
      "entropy": 2.495544731616974,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.314620568999999,
      "epoch": 0.25,
      "step": 10
    },
    {
      "loss": 7.450580596923828e-09,
      "grad_norm": 0.5361798405647278,
      "learning_rate": 8.055555555555557e-06,
      "num_tokens": 49344.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.6028223484754562,
      "rewards/grpo_reward_fn/std": 0.635626494884491,
      "reward": -0.6028223484754562,
      "reward_std": 0.6356264650821686,
      "frac_reward_zero_std": 0.5,
      "entropy": 1.98321932554245,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.410889981499963,
      "epoch": 0.3,
      "step": 12
    },
    {
      "loss": -7.344143959642224e-09,
      "grad_norm": 0.7255711555480957,
      "learning_rate": 7.500000000000001e-06,
      "num_tokens": 57568.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.29520659148693085,
      "rewards/grpo_reward_fn/std": 0.8585895001888275,
      "reward": -0.29520659148693085,
      "reward_std": 0.8585895001888275,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.692120999097824,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.303587671999935,
      "epoch": 0.35,
      "step": 14
    },
    {
      "loss": 2.554484845873617e-09,
      "grad_norm": 0.776721179485321,
      "learning_rate": 6.944444444444445e-06,
      "num_tokens": 65552.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.4467405825853348,
      "rewards/grpo_reward_fn/std": 0.8205806314945221,
      "reward": -0.4467405825853348,
      "reward_std": 0.8205806016921997,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.7413092851638794,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 10.958430156499958,
      "epoch": 0.4,
      "step": 16
    },
    {
      "loss": -3.725290298461914e-09,
      "grad_norm": 0.4659952223300934,
      "learning_rate": 6.3888888888888885e-06,
      "num_tokens": 73776.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.785138338804245,
      "rewards/grpo_reward_fn/std": 0.6077205836772919,
      "reward": -0.785138338804245,
      "reward_std": 0.6077205836772919,
      "frac_reward_zero_std": 0.5,
      "entropy": 2.087699919939041,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.285804093499905,
      "epoch": 0.45,
      "step": 18
    },
    {
      "loss": 1.30385160446167e-08,
      "grad_norm": 0.7443047165870667,
      "learning_rate": 5.833333333333334e-06,
      "num_tokens": 81760.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.4372890442609787,
      "rewards/grpo_reward_fn/std": 0.766058087348938,
      "reward": -0.4372890442609787,
      "reward_std": 0.7660580277442932,
      "frac_reward_zero_std": 0.25,
      "entropy": 1.7085065245628357,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 10.8905601969999,
      "epoch": 0.5,
      "step": 20
    },
    {
      "loss": 0.0,
      "grad_norm": 0.8972760438919067,
      "learning_rate": 5.2777777777777785e-06,
      "num_tokens": 89744.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.30172646045684814,
      "rewards/grpo_reward_fn/std": 0.7638400793075562,
      "reward": -0.30172646045684814,
      "reward_std": 0.7638400793075562,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.6857776045799255,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.429606533499964,
      "epoch": 0.55,
      "step": 22
    },
    {
      "loss": 1.3562384992837906e-08,
      "grad_norm": 0.6416975259780884,
      "learning_rate": 4.722222222222222e-06,
      "num_tokens": 97968.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.41030485928058624,
      "rewards/grpo_reward_fn/std": 0.6497728824615479,
      "reward": -0.41030485928058624,
      "reward_std": 0.6497728526592255,
      "frac_reward_zero_std": 0.25,
      "entropy": 1.7449655532836914,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.289256847499814,
      "epoch": 0.6,
      "step": 24
    },
    {
      "loss": 1.30385160446167e-08,
      "grad_norm": 0.5383069515228271,
      "learning_rate": 4.166666666666667e-06,
      "num_tokens": 106192.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.8011916875839233,
      "rewards/grpo_reward_fn/std": 0.5623147785663605,
      "reward": -0.8011916875839233,
      "reward_std": 0.5623147785663605,
      "frac_reward_zero_std": 0.5,
      "entropy": 2.0510507822036743,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.259985542499862,
      "epoch": 0.65,
      "step": 26
    },
    {
      "loss": 7.450580596923828e-09,
      "grad_norm": 0.5758256316184998,
      "learning_rate": 3.6111111111111115e-06,
      "num_tokens": 114176.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.7853103280067444,
      "rewards/grpo_reward_fn/std": 0.6072340309619904,
      "reward": -0.7853103280067444,
      "reward_std": 0.6072340309619904,
      "frac_reward_zero_std": 0.5,
      "entropy": 2.078708440065384,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 10.897705246999749,
      "epoch": 0.7,
      "step": 28
    },
    {
      "loss": 0.0,
      "grad_norm": 0.8784902691841125,
      "learning_rate": 3.055555555555556e-06,
      "num_tokens": 122640.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.5234325528144836,
      "rewards/grpo_reward_fn/std": 0.7491043210029602,
      "reward": -0.5234325528144836,
      "reward_std": 0.7491043210029602,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.8970149159431458,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.184859331999974,
      "epoch": 0.75,
      "step": 30
    },
    {
      "loss": 7.450580596923828e-09,
      "grad_norm": 0.7091466784477234,
      "learning_rate": 2.5e-06,
      "num_tokens": 130864.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.4050380438566208,
      "rewards/grpo_reward_fn/std": 0.8165028691291809,
      "reward": -0.4050380438566208,
      "reward_std": 0.8165028393268585,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.5603494346141815,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.331761374000052,
      "epoch": 0.8,
      "step": 32
    },
    {
      "loss": -1.862645149230957e-08,
      "grad_norm": 0.575290858745575,
      "learning_rate": 1.944444444444445e-06,
      "num_tokens": 139088.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.6410945951938629,
      "rewards/grpo_reward_fn/std": 0.6226174235343933,
      "reward": -0.6410945951938629,
      "reward_std": 0.6226174235343933,
      "frac_reward_zero_std": 0.25,
      "entropy": 2.1338054835796356,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.323922803500068,
      "epoch": 0.85,
      "step": 34
    },
    {
      "loss": 0.0,
      "grad_norm": 0.9170640707015991,
      "learning_rate": 1.3888888888888892e-06,
      "num_tokens": 147552.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.47176720201969147,
      "rewards/grpo_reward_fn/std": 0.6907638609409332,
      "reward": -0.47176720201969147,
      "reward_std": 0.6907638609409332,
      "frac_reward_zero_std": 0.0,
      "entropy": 1.7782395780086517,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.178734629500127,
      "epoch": 0.9,
      "step": 36
    },
    {
      "loss": -7.450580596923828e-09,
      "grad_norm": 0.6660693287849426,
      "learning_rate": 8.333333333333333e-07,
      "num_tokens": 155776.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.7778551578521729,
      "rewards/grpo_reward_fn/std": 0.6283205449581146,
      "reward": -0.7778551578521729,
      "reward_std": 0.6283205449581146,
      "frac_reward_zero_std": 0.5,
      "entropy": 2.0920713543891907,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.27789309100001,
      "epoch": 0.95,
      "step": 38
    },
    {
      "loss": -1.30385160446167e-08,
      "grad_norm": 0.7441156506538391,
      "learning_rate": 2.7777777777777776e-07,
      "num_tokens": 164000.0,
      "completions/mean_length": 140.0,
      "completions/min_length": 140.0,
      "completions/max_length": 140.0,
      "completions/clipped_ratio": 1.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_terminated_length": 0.0,
      "completions/max_terminated_length": 0.0,
      "rewards/grpo_reward_fn/mean": -0.6006919145584106,
      "rewards/grpo_reward_fn/std": 0.6558198630809784,
      "reward": -0.6006919145584106,
      "reward_std": 0.655819833278656,
      "frac_reward_zero_std": 0.0,
      "entropy": 2.056402266025543,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/high_max": 0.0,
      "clip_ratio/region_mean": 0.0,
      "step_time": 11.313541456000053,
      "epoch": 1.0,
      "step": 40
    },
    {
      "train_runtime": 469.1217,
      "train_samples_per_second": 0.682,
      "train_steps_per_second": 0.085,
      "total_flos": 0.0,
      "train_loss": 8.670447138037218e-10,
      "epoch": 1.0,
      "step": 40
    }
  ],
  "config": {
    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
    "n_sft_examples": 120,
    "sft_epochs": 1,
    "n_grpo_prompts": 80,
    "grpo_steps": 40,
    "train_mode": "short"
  }
}