How to shape the reward successfully

I am trying to use callbacks to shape the reward, but it does not seem to work: training still uses the original reward. Does anyone know how to shape the reward correctly?

import ast
import re

import numpy as np

# Defined on my callbacks class (a DefaultCallbacks subclass);
# self.alpha and self.lambd are set on that class (values not shown here).
def on_postprocess_trajectory(self, *, worker, episode, agent_id,
                              policy_id, policies, postprocessed_batch,
                              original_batches, **kwargs):
    rewards = postprocessed_batch["rewards"]
    episode_ids = postprocessed_batch["eps_id"]
    # print(f"[Before modification] rewards: {rewards}")

    # Step 1: load the reference reward data from reward.txt.
    all_past_rewards = []
    try:
        with open("reward.txt", "r") as f:
            for line in f:
                match = re.match(r"Iteration (\d+): (\[.*\])", line.strip())
                if match:
                    reward_list = ast.literal_eval(match.group(2))
                    all_past_rewards.extend(reward_list)
    except Exception as e:
        print(f"[Warning] Failed to read reward.txt: {e}")
        return

    if not all_past_rewards:
        print("[Info] No reference reward data found. Skipping modification.")
        return

    J_ref_mean = float(np.mean(all_past_rewards))
    if J_ref_mean == 0:
        print("[Warning] J_ref_mean is zero. Skipping modification.")
        return

    # Step 2: add a per-episode bonus based on how that episode's return compares to the reference mean.
    rewards = np.array(rewards, dtype=np.float32)
    episode_ids = np.array(episode_ids)
    modified_rewards = rewards.copy()

    unique_eps_ids = np.unique(episode_ids)
    for eps_id in unique_eps_ids:
        mask = episode_ids == eps_id
        traj_rewards = rewards[mask]

        J_pi_t = float(np.sum(traj_rewards))

        # Relative gap between this episode's return and the reference mean,
        # scaled by alpha; the shaping bonus is lambd times that gap.
        delta = self.alpha * (J_pi_t - J_ref_mean) / abs(J_ref_mean)
        bonus = self.lambd * delta

        modified_rewards[mask] = traj_rewards + bonus

        # print(f"[eps_id {eps_id}] J_pi_t = {J_pi_t:.4f}, bonus = {bonus:.4f}")

    # postprocessed_batch["rewards"] = modified_rewards
    # Deliberately zero every reward as a sanity check: if the shaping were
    # actually used by training, learning should stall with all-zero rewards.
    postprocessed_batch["rewards"][:] = 0.0
    # print(f"[After modification]  rewards: {postprocessed_batch['rewards']}")

# def on_learn_on_batch(self, *, train_batch, **kwargs):
#     print(f"[Train] Mean of rewards used for training: {torch.mean(train_batch['rewards']):.2f}")
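
For context, the method above lives on a callbacks class that I pass to the algorithm config. A minimal sketch of that wiring (the class name RewardShapingCallbacks, the PPO/CartPole choice, and the alpha/lambd values are placeholders, not my actual setup):

from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.algorithms.ppo import PPOConfig  # PPO used only as an example


class RewardShapingCallbacks(DefaultCallbacks):  # placeholder name
    def __init__(self):
        super().__init__()
        self.alpha = 1.0  # placeholder value
        self.lambd = 1.0  # placeholder value

    # ... on_postprocess_trajectory() from above goes here ...


config = (
    PPOConfig()
    .environment("CartPole-v1")          # example environment only
    .callbacks(RewardShapingCallbacks)   # register the callbacks class
)
algo = config.build()
result = algo.train()
print(f"reward_mean = {result['episode_reward_mean']}")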

Even at the end, where I set postprocessed_batch["rewards"][:] = 0.0, the policy keeps improving, which means I did not shape the reward successfully.
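
To verify what the learner actually trains on, I can enable the commented-out on_learn_on_batch hook above; a rough sketch (I am not sure whether train_batch holds numpy arrays or torch tensors at that point, so the conversion is defensive):

import numpy as np

def on_learn_on_batch(self, *, policy, train_batch, result, **kwargs):
    rewards = train_batch["rewards"]
    # Depending on the framework/version this may be a numpy array or a torch
    # tensor; convert defensively before averaging.
    if hasattr(rewards, "cpu"):
        rewards = rewards.cpu().numpy()
    print(f"[Train] mean reward in train_batch: {float(np.mean(rewards)):.2f}")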

The training result is:
Iteration 0: reward_mean = -28.59308126702178
(RolloutWorker pid=722869) [Info] No reference reward data found. Skipping modification.
Iteration 1: reward_mean = -27.926609622752025
Iteration 2: reward_mean = -29.537007496950245
Iteration 3: reward_mean = -27.49214921991107
Iteration 4: reward_mean = -27.915504150158803
Iteration 5: reward_mean = -28.2339961136319
Iteration 6: reward_mean = -25.00310046638634
Iteration 7: reward_mean = -24.842911662136384
Iteration 8: reward_mean = -24.348385038314316
Iteration 9: reward_mean = -22.340533948573928
Iteration 10: reward_mean = -21.05639993285537
(RolloutWorker pid=722855) [Info] No reference reward data found. Skipping modification. [repeated 35x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see Configuring Logging — Ray 3.0.0.dev0 for more options.)
Iteration 11: reward_mean = -19.304802045934313
Iteration 12: reward_mean = -17.515987084203044
Iteration 13: reward_mean = -16.753565445431303
Iteration 14: reward_mean = -15.653722264630153
Iteration 15: reward_mean = -14.370444683592925
Iteration 16: reward_mean = -13.968583288635543
Iteration 17: reward_mean = -12.815635155937894
Iteration 18: reward_mean = -12.360415438712742
Iteration 19: reward_mean = -11.89852846687779
Iteration 20: reward_mean = -11.208693932521056
(RolloutWorker pid=722855) [Info] No reference reward data found. Skipping modification. [repeated 54x across cluster]
Iteration 21: reward_mean = -10.757852513625004
Iteration 22: reward_mean = -10.106308960674436
Iteration 23: reward_mean = -9.619814866014272
Iteration 24: reward_mean = -8.866933112119897
Iteration 25: reward_mean = -8.819288475415275
Iteration 26: reward_mean = -8.43527290887007
Iteration 27: reward_mean = -7.525015116340972
Iteration 28: reward_mean = -7.461048875628364
Iteration 29: reward_mean = -6.5297037278909285
Iteration 30: reward_mean = -5.533162890410698
(RolloutWorker pid=722846) [Info] No reference reward data found. Skipping modification. [repeated 33x across cluster]
Iteration 31: reward_mean = -6.069513212280274
Iteration 32: reward_mean = -5.706249136330277
Iteration 33: reward_mean = -3.8746266062432726
Iteration 34: reward_mean = -4.6182359208875114
Iteration 35: reward_mean = -4.384016437088916
Iteration 36: reward_mean = -3.5716920271568835
Iteration 37: reward_mean = -3.8652758131442773
Iteration 38: reward_mean = -5.003582194746292
Iteration 39: reward_mean = -3.4219876047281956
Iteration 40: reward_mean = -1.8751388627584236
(RolloutWorker pid=722848) [Info] No reference reward data found. Skipping modification. [repeated 7x across cluster]
Iteration 41: reward_mean = -0.7950476849319777
Iteration 42: reward_mean = -1.555348387638108
Iteration 43: reward_mean = -1.354055207816635
Iteration 44: reward_mean = -0.24345237055854707
Iteration 45: reward_mean = -1.0958554347711937
Iteration 46: reward_mean = 0.18766577630727774
Iteration 47: reward_mean = 0.5642680042687873
Iteration 48: reward_mean = -0.08101772324081245
Iteration 49: reward_mean = 1.2202637418313034
Iteration 50: reward_mean = -0.28758950434302516
Iteration 51: reward_mean = 0.5470823602958924
Iteration 52: reward_mean = -0.4438649575513892
Iteration 53: reward_mean = 2.377146069300398
Iteration 54: reward_mean = -0.1929521731442942
Iteration 55: reward_mean = 1.637198900800009
Iteration 56: reward_mean = 1.02371664211546
Iteration 57: reward_mean = 1.4231957488125913
Iteration 58: reward_mean = 2.275497129076649
Iteration 59: reward_mean = 2.792970046641468
Iteration 60: reward_mean = 3.842128587774733
(RolloutWorker pid=722852) [Info] No reference reward data found. Skipping modification. [repeated 6x across cluster]
Iteration 61: reward_mean = -0.7022179436315251
Iteration 62: reward_mean = 2.3560096030506914
Iteration 63: reward_mean = 2.256081429722187
Iteration 64: reward_mean = 1.4439557883771508
Iteration 65: reward_mean = 0.6293297937337625
Iteration 66: reward_mean = 0.4997833411974537
Iteration 67: reward_mean = 1.4176430113469414
Iteration 68: reward_mean = 1.8609281236601058
Iteration 69: reward_mean = 4.103538168098668
Iteration 70: reward_mean = 2.339563544396816
(RolloutWorker pid=722852) [Info] No reference reward data found. Skipping modification. [repeated 2x across cluster]
Iteration 71: reward_mean = 0.9877521549621272
Iteration 72: reward_mean = 3.243765788161568
Iteration 73: reward_mean = 2.4028598179603864
Iteration 74: reward_mean = 2.9098804144495882
Iteration 75: reward_mean = 3.215528964661193
Iteration 76: reward_mean = -1.042381351080854
Iteration 77: reward_mean = 2.424467093095826
Iteration 78: reward_mean = 0.5707936691443973
Iteration 79: reward_mean = 2.296924754256875
Iteration 80: reward_mean = 1.578144732941451
(RolloutWorker pid=722851) [Info] No reference reward data found. Skipping modification.
(RolloutWorker pid=722852) [Info] No reference reward data found. Skipping modification.
Iteration 81: reward_mean = 0.01747257978594883
Iteration 82: reward_mean = 4.544883811484569
Iteration 83: reward_mean = 1.1985802927800224
Iteration 84: reward_mean = 3.2973984861611423
Iteration 85: reward_mean = 0.6446376164648702
Iteration 86: reward_mean = 1.4203796209419415
Iteration 87: reward_mean = 3.9105373075198018
Iteration 88: reward_mean = 2.7968526715050994
Iteration 89: reward_mean = 4.040932954620365
Iteration 90: reward_mean = 3.22068927993913
(RolloutWorker pid=722853) [Info] No reference reward data found. Skipping modification. [repeated 6x across cluster]
Iteration 91: reward_mean = 4.145234518638268
Iteration 92: reward_mean = 7.58472311904776
Iteration 93: reward_mean = 4.337937836866415
Iteration 94: reward_mean = 4.076553309001055
Iteration 95: reward_mean = 1.6992229249306279
Iteration 96: reward_mean = 1.7470147681410557
Iteration 97: reward_mean = 6.172602048038321
Iteration 98: reward_mean = 1.9521460274745082
Iteration 99: reward_mean = 1.0714745127223244
Iteration 100: reward_mean = 5.153385356198487
Iteration 101: reward_mean = 3.2011577015792656
(RolloutWorker pid=722867) [Info] No reference reward data found. Skipping modification.
Iteration 102: reward_mean = 5.991574895157995
Iteration 103: reward_mean = 3.160550377151459
Iteration 104: reward_mean = 5.324873328562272
Iteration 105: reward_mean = 2.5793739597050096
Iteration 106: reward_mean = 5.276757015347875
Iteration 107: reward_mean = 2.508221840771249
Iteration 108: reward_mean = 4.370084070800627
Iteration 109: reward_mean = 5.23962658080848
Iteration 110: reward_mean = 3.729080336157907
(RolloutWorker pid=722851) [Info] No reference reward data found. Skipping modification.
Iteration 111: reward_mean = 2.955722782824903
Iteration 112: reward_mean = 2.6961123985882356
Iteration 113: reward_mean = 4.536369379424729
Iteration 114: reward_mean = 3.6160202112956044
Iteration 115: reward_mean = 2.6582817917342987
Iteration 116: reward_mean = 4.1788418012544275
Iteration 117: reward_mean = 3.2914624640747285
Iteration 118: reward_mean = 5.532188754982876
Iteration 119: reward_mean = 6.839514429871755
Iteration 120: reward_mean = 5.848417547720066
(RolloutWorker pid=722857) [Info] No reference reward data found. Skipping modification. [repeated 2x across cluster]
Iteration 121: reward_mean = 0.3381628109346052
Iteration 122: reward_mean = 5.197990399443436
Iteration 123: reward_mean = 5.331305949189792
Iteration 124: reward_mean = 2.5838410113340395
Iteration 125: reward_mean = 6.6019439297072084
Iteration 126: reward_mean = 7.12408468324147
Iteration 127: reward_mean = 3.0683394860221602
Iteration 128: reward_mean = 5.009549919107619
Iteration 129: reward_mean = 5.844002896209155
Iteration 130: reward_mean = 3.9827722329030313
(RolloutWorker pid=722861) [Info] No reference reward data found. Skipping modification. [repeated 3x across cluster]
Iteration 131: reward_mean = 4.181512042108795
Iteration 132: reward_mean = 2.891529260552943
Iteration 133: reward_mean = 3.5634339209310903
Iteration 134: reward_mean = 7.06385511951801
Iteration 135: reward_mean = 2.664522246149081
Iteration 136: reward_mean = 6.693395998046971
Iteration 137: reward_mean = 4.016957291217385
Iteration 138: reward_mean = 5.9706678728134746
Iteration 139: reward_mean = 4.856059904496798
Iteration 140: reward_mean = 4.974673744959895
Iteration 141: reward_mean = 5.741981968127034
Iteration 142: reward_mean = 6.319046975324793
Iteration 143: reward_mean = 5.404260184071049
Iteration 144: reward_mean = 3.600115945054591
Iteration 145: reward_mean = 1.9516382578296811
Iteration 146: reward_mean = 4.149458208954114
Iteration 147: reward_mean = 0.6284453539072578
Iteration 148: reward_mean = 1.5546430338886887
Iteration 149: reward_mean = 3.083551841633319
Iteration 150: reward_mean = 1.868334434832178
(RolloutWorker pid=722849) [Info] No reference reward data found. Skipping modification. [repeated 4x across cluster]
Iteration 151: reward_mean = 5.566475325013622
Iteration 152: reward_mean = 6.471757647173228
Iteration 153: reward_mean = 4.572245767757353
Iteration 154: reward_mean = 1.728834174686245
Iteration 155: reward_mean = 3.59882594623308
Iteration 156: reward_mean = 6.031075579394998
Iteration 157: reward_mean = 6.95777461083233
Iteration 158: reward_mean = 5.819124790825835
Iteration 159: reward_mean = 7.080013422972255
Iteration 160: reward_mean = 10.092479866240609
(RolloutWorker pid=722856) [Info] No reference reward data found. Skipping modification.
(RolloutWorker pid=722853) [Info] No reference reward data found. Skipping modification.
Iteration 161: reward_mean = 6.525535027772099
Iteration 162: reward_mean = 3.7293318369636865
Iteration 163: reward_mean = 6.13684024814144
Iteration 164: reward_mean = 8.103568370967395
Iteration 165: reward_mean = 8.560664604768352
Iteration 166: reward_mean = 4.746516670349473
Iteration 167: reward_mean = 5.435336726917868
Iteration 168: reward_mean = 8.637622519518766
Iteration 169: reward_mean = 9.821139135008536
Iteration 170: reward_mean = 9.361124193854442
Iteration 171: reward_mean = 9.006563833840401
Iteration 172: reward_mean = 11.207141387871893
Iteration 173: reward_mean = 11.930713754230569
Iteration 174: reward_mean = 9.377131728778195
Iteration 175: reward_mean = 3.554547692251173
Iteration 176: reward_mean = 7.361728843040722
Iteration 177: reward_mean = 2.2081067623468487
Iteration 178: reward_mean = 6.054919199738388
Iteration 179: reward_mean = 7.886125283060306
Iteration 180: reward_mean = 9.455831927955284
Iteration 181: reward_mean = 13.794002458618854
Iteration 182: reward_mean = 11.321435424431336
Iteration 183: reward_mean = 9.15110553137731
Iteration 184: reward_mean = 9.025523080735516
Iteration 185: reward_mean = 10.70021349588451
Iteration 186: reward_mean = 11.96041033509962
Iteration 187: reward_mean = 13.037157065445667
Iteration 188: reward_mean = 11.651250247455987
Iteration 189: reward_mean = 8.948148198725097
Iteration 190: reward_mean = 15.281994671136083
(RolloutWorker pid=722863) [Info] No reference reward data found. Skipping modification. [repeated 5x across cluster]
Iteration 191: reward_mean = 11.471017748314614
Iteration 192: reward_mean = 9.657852624359741
Iteration 193: reward_mean = 9.883666503712632
Iteration 194: reward_mean = 9.085432572543949
Iteration 195: reward_mean = 13.521027462298607
Iteration 196: reward_mean = 7.9942802007424865
Iteration 197: reward_mean = 10.045839523997499
Iteration 198: reward_mean = 14.86721708614407
Iteration 199: reward_mean = 12.404374472048037
Iteration 200: reward_mean = 10.479734609531858
(RolloutWorker pid=722849) [Info] No reference reward data found. Skipping modification.
Iteration 201: reward_mean = 16.225420413927857
Iteration 202: reward_mean = 12.034333929288104
Iteration 203: reward_mean = 13.422861648269933
Iteration 204: reward_mean = 12.56361952995035
Iteration 205: reward_mean = 11.457335675542792
Iteration 206: reward_mean = 12.324332364572767
Iteration 207: reward_mean = 13.753624638393077
Iteration 208: reward_mean = 9.520497031523465
Iteration 209: reward_mean = 14.36521881516431
Iteration 210: reward_mean = 11.739362454396456
Iteration 211: reward_mean = 9.360764015258603
Iteration 212: reward_mean = 11.872769169661193
Iteration 213: reward_mean = 11.428156706440797
Iteration 214: reward_mean = 11.642346505646383
Iteration 215: reward_mean = 14.228092880454831
Iteration 216: reward_mean = 13.54717439317209
Iteration 217: reward_mean = 14.870048393105018
Iteration 218: reward_mean = 9.646678422526215
Iteration 219: reward_mean = 12.91061276698833
Iteration 220: reward_mean = 18.372476340394417
Iteration 221: reward_mean = 14.245301548848206
Iteration 222: reward_mean = 11.357043377246356
Iteration 223: reward_mean = 9.593608139398285
Iteration 224: reward_mean = 11.61235867591063
Iteration 225: reward_mean = 10.790124256054712
Iteration 226: reward_mean = 8.648595933622865
Iteration 227: reward_mean = 15.258935572195348
Iteration 228: reward_mean = 13.785987650664476
Iteration 229: reward_mean = 12.632958327280752
Iteration 230: reward_mean = 14.147749642799491
(RolloutWorker pid=722859) [Info] No reference reward data found. Skipping modification.
Iteration 231: reward_mean = 13.161863904322221
Iteration 232: reward_mean = 13.838881754048222