UPSTREAM: mediatek/mt8183: Add RX DQS gating calibration part of DDR driver

BUG=b:80501386
BRANCH=none
TEST=Boots correctly on Kukui, and inits DRAM successfully with related
     patches.

Change-Id: Ie5712beea6e0640b97fff865dc6f0c6fe2305ccc
Signed-off-by: Patrick Georgi <pgeorgi@google.com>
Original-Commit-Id: fcdbce2decbf88a39b2da29ad1137a08e1d9ca95
Original-Change-Id: I504d6d5c9ea01b11a9f2a05b5ee4b5f1af87e23f
Original-Signed-off-by: Huayang Duan <huayang.duan@mediatek.com>
Original-Reviewed-on: https://review.coreboot.org/c/28841
Original-Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Original-Reviewed-by: You-Cheng Syu <youcheng@google.com>
Original-Reviewed-by: Hung-Te Lin <hungte@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/1355098
Commit-Ready: Patrick Georgi <pgeorgi@chromium.org>
Tested-by: Patrick Georgi <pgeorgi@chromium.org>
Reviewed-by: Patrick Georgi <pgeorgi@chromium.org>
diff --git a/src/soc/mediatek/mt8183/dramc_pi_calibration_api.c b/src/soc/mediatek/mt8183/dramc_pi_calibration_api.c
index 823c512..08264fd 100644
--- a/src/soc/mediatek/mt8183/dramc_pi_calibration_api.c
+++ b/src/soc/mediatek/mt8183/dramc_pi_calibration_api.c
@@ -20,6 +20,11 @@
 #include <soc/dramc_register.h>
 #include <soc/dramc_pi_api.h>
 
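+/*
+ * Coarse-tune search window for RX DQS gating calibration. Each coarse step
+ * is split into a "2T" part (coarse_tune / RX_DQS_CTL_LOOP) and a "0.5T"
+ * part (coarse_tune % RX_DQS_CTL_LOOP) when the gating delay registers are
+ * programmed.
+ */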
+enum {
+	GATING_START = 26,
+	GATING_END = GATING_START + 24,
+};
+
 static void auto_refresh_switch(u8 chn, u8 option)
 {
 	clrsetbits_le32(&ch[chn].ao.refctrl0, 1 << REFCTRL0_REFDIS_SHIFT,
@@ -86,7 +91,7 @@
 	clrsetbits_le32(&ch[chn].phy.shu[0].rk[rank].ca_cmd[9],
 		SHU1_CA_CMD9_RG_RK_ARFINE_TUNE_CLK_MASK, 0);
 
-	for (u8 i = 0; i < DQS_NUMBER; i++) {
+	for (size_t i = 0; i < DQS_NUMBER; i++) {
 		s32 wrlevel_dq_delay = wr_level[chn][rank][i] + 0x10;
 		assert(wrlevel_dq_delay < 0x40);
 
@@ -117,8 +122,8 @@
 
 static void dramc_read_dbi_onoff(u8 onoff)
 {
-	for (u8 chn = 0; chn < CHANNEL_MAX; chn++)
-		for (u8 b = 0; b < 2; b++)
+	for (size_t chn = 0; chn < CHANNEL_MAX; chn++)
+		for (size_t b = 0; b < 2; b++)
 			clrsetbits_le32(&ch[chn].phy.shu[0].b[b].dq[7],
 				0x1 << SHU1_BX_DQ7_R_DMDQMDBI_SHU_SHIFT,
 				onoff << SHU1_BX_DQ7_R_DMDQMDBI_SHU_SHIFT);
@@ -126,7 +131,7 @@
 
 static void dramc_write_dbi_onoff(u8 onoff)
 {
-	for (u8 chn = 0; chn < CHANNEL_MAX; chn++)
+	for (size_t chn = 0; chn < CHANNEL_MAX; chn++)
 		clrsetbits_le32(&ch[chn].ao.shu[0].wodt,
 			0x1 << SHU1_WODT_DBIWR_SHIFT,
 			onoff << SHU1_WODT_DBIWR_SHIFT);
@@ -150,11 +155,9 @@
 static void dramc_enable_phy_dcm(u8 en)
 {
 	u32 broadcast_bak = dramc_get_broadcast();
-	u8 chn = 0;
-
 	dramc_set_broadcast(DRAMC_BROADCAST_OFF);
 
-	for (chn = 0; chn < CHANNEL_MAX ; chn++) {
+	for (size_t chn = 0; chn < CHANNEL_MAX; chn++) {
 		clrbits_le32(&ch[chn].phy.b[0].dll_fine_tune[1], 0x1 << 20);
 		clrbits_le32(&ch[chn].phy.b[1].dll_fine_tune[1], 0x1 << 20);
 		clrbits_le32(&ch[chn].phy.ca_dll_fine_tune[1], 0x1 << 20);
@@ -191,17 +194,17 @@
 			clrsetbits_le32(&shu->b[1].dq[7], mask, value);
 			clrsetbits_le32(&shu->ca_cmd[7], mask, value);
 		}
-	}
 
-	if (!en)
-		dramc_phy_dcm_disable(chn);
+		if (!en)
+			dramc_phy_dcm_disable(chn);
+	}
 	dramc_set_broadcast(broadcast_bak);
 }
 
 static void reset_delay_chain_before_calibration(void)
 {
-	for (u8 chn = 0; chn < CHANNEL_MAX; chn++)
-		for (u8 rank = 0; rank < RANK_MAX; rank++) {
+	for (size_t chn = 0; chn < CHANNEL_MAX; chn++)
+		for (size_t rank = 0; rank < RANK_MAX; rank++) {
 			struct dramc_ddrphy_regs_shu_rk *rk;
 			rk = &ch[chn].phy.shu[0].rk[rank];
 			clrbits_le32(&rk->ca_cmd[0], 0xffffff << 0);
@@ -233,8 +236,6 @@
 
 void dramc_apply_pre_calibration_config(void)
 {
-	u8 shu = 0;
-
 	dramc_enable_phy_dcm(0);
 	reset_delay_chain_before_calibration();
 
@@ -242,7 +243,7 @@
 	setbits_le32(&ch[0].ao.spcmdctrl, 0x1 << 24);
 	clrsetbits_le32(&ch[0].ao.shu[0].scintv, 0x1f << 1, 0x1b << 1);
 
-	for (shu = 0; shu < DRAM_DFS_SHUFFLE_MAX; shu++)
+	for (size_t shu = 0; shu < DRAM_DFS_SHUFFLE_MAX; shu++)
 		setbits_le32(&ch[0].ao.shu[shu].conf[3], 0x1ff << 0);
 
 	clrbits_le32(&ch[0].ao.dramctrl, 0x1 << 18);
@@ -254,10 +255,10 @@
 	dramc_write_dbi_onoff(DBI_OFF);
 	dramc_read_dbi_onoff(DBI_OFF);
 
-	for (int chn = 0; chn < CHANNEL_MAX; chn++) {
+	for (size_t chn = 0; chn < CHANNEL_MAX; chn++) {
 		setbits_le32(&ch[chn].ao.spcmdctrl, 0x1 << 29);
 		setbits_le32(&ch[chn].ao.dqsoscr, 0x1 << 24);
-		for (shu = 0; shu < DRAM_DFS_SHUFFLE_MAX; shu++)
+		for (size_t shu = 0; shu < DRAM_DFS_SHUFFLE_MAX; shu++)
 			setbits_le32(&ch[chn].ao.shu[shu].scintv, 0x1 << 30);
 
 		clrbits_le32(&ch[chn].ao.dummy_rd, (0x1 << 7) | (0x7 << 20));
@@ -287,6 +288,646 @@
 	}
 }
 
+static void rx_dqs_isi_pulse_cg_switch(u8 chn, bool flag)
+{
+	for (size_t b = 0; b < 2; b++)
+		clrsetbits_le32(&ch[chn].phy.b[b].dq[6], 1 << 5,
+				(flag ? 1 : 0) << 5);
+}
+
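+/* Point test engine 2 at the given rank (TESTAGENTRK) before running it. */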
+static void dramc_set_rank_engine2(u8 chn, u8 rank)
+{
+	setbits_le32(&ch[chn].ao.dramctrl, 0x1 << 1);
+	clrbits_le32(&ch[chn].ao.test2_4, TEST2_4_TESTAGENTRKSEL_MASK);
+	clrsetbits_le32(&ch[chn].ao.test2_4, TEST2_4_TESTAGENTRK_MASK,
+		rank << TEST2_4_TESTAGENTRK_SHIFT);
+}
+
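+/*
+ * Set up test engine 2 for the calibration reads: select the rank under
+ * test, disable the dummy-read paths, program the base pattern (0x55/0xaa)
+ * and the test address/size, and choose between the XTALK pattern
+ * (testaudpat == false) and the AUDIO pattern (testaudpat == true).
+ */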
+static void dramc_engine2_init(u8 chn, u8 rank, u32 size, bool testaudpat)
+{
+	const u32 pat0 = 0x55;
+	const u32 pat1 = 0xaa;
+	const u32 addr = 0;
+
+	dramc_set_rank_engine2(chn, rank);
+
+	clrbits_le32(&ch[chn].ao.dummy_rd,
+		(0x1 << DUMMY_RD_DQSG_DMYRD_EN_SHIFT) |
+		(0x1 << DUMMY_RD_DQSG_DMYWR_EN_SHIFT) |
+		(0x1 << DUMMY_RD_DUMMY_RD_EN_SHIFT) |
+		(0x1 << DUMMY_RD_SREF_DMYRD_EN_SHIFT) |
+		(0x1 << DUMMY_RD_DMY_RD_DBG_SHIFT) |
+		(0x1 << DUMMY_RD_DMY_WR_DBG_SHIFT));
+	clrbits_le32(&ch[chn].nao.testchip_dma1,
+		0x1 << TESTCHIP_DMA1_DMA_LP4MATAB_OPT_SHIFT);
+	clrbits_le32(&ch[chn].ao.test2_3,
+		(0x1 << TEST2_3_TEST2W_SHIFT) |
+		(0x1 << TEST2_3_TEST2R_SHIFT) |
+		(0x1 << TEST2_3_TEST1_SHIFT));
+	clrsetbits_le32(&ch[chn].ao.test2_0,
+		TEST2_0_PAT0_MASK | TEST2_0_PAT1_MASK,
+		(pat0 << TEST2_0_PAT0_SHIFT) |
+		(pat1 << TEST2_0_PAT1_SHIFT));
+	write32(&ch[chn].ao.test2_1, (addr << 4) & 0x00ffffff);
+	write32(&ch[chn].ao.test2_2, (size << 4) & 0x00ffffff);
+
+	clrsetbits_le32(&ch[chn].ao.test2_4,
+		(0x1 << TEST2_4_TESTAUDMODE_SHIFT) |
+		(0x1 << TEST2_4_TESTAUDBITINV_SHIFT) |
+		(0x1 << TEST2_4_TESTXTALKPAT_SHIFT),
+		((!testaudpat ? 1 : 0) << TEST2_4_TESTXTALKPAT_SHIFT) |
+		((testaudpat ? 1 : 0) << TEST2_4_TESTAUDMODE_SHIFT) |
+		((testaudpat ? 1 : 0) << TEST2_4_TESTAUDBITINV_SHIFT));
+
+	if (!testaudpat) {
+		clrbits_le32(&ch[chn].ao.test2_4,
+			(0x1 << TEST2_4_TEST_REQ_LEN1_SHIFT) |
+			(0x1 << TEST2_4_TESTSSOPAT_SHIFT) |
+			(0x1 << TEST2_4_TESTSSOXTALKPAT_SHIFT));
+		setbits_le32(&ch[chn].ao.perfctl0,
+			0x1 << PERFCTL0_RWOFOEN_SHIFT);
+	} else {
+		clrsetbits_le32(&ch[chn].ao.test2_4,
+			TEST2_4_TESTAUDINIT_MASK | TEST2_4_TESTAUDINC_MASK,
+			(0x11 << TEST2_4_TESTAUDINIT_SHIFT) |
+			(0xd << TEST2_4_TESTAUDINC_SHIFT));
+	}
+	clrsetbits_le32(&ch[chn].ao.test2_3,
+		TEST2_3_TESTCNT_MASK | (0x1 << TEST2_3_TESTAUDPAT_SHIFT),
+		(testaudpat ? 1 : 0) << TEST2_3_TESTAUDPAT_SHIFT);
+}
+
+static void dramc_engine2_check_complete(u8 chn)
+{
+	u32 u4loop_count = 0;
+
+	/*
+	 * In some cases the test engine has finished but the complete signal
+	 * arrives late, and the system would wait for a very long time, so
+	 * we set a timeout here. Once the complete signal is received or the
+	 * timeout expires this function returns; the caller then checks the
+	 * compare result to verify whether the engine succeeded.
+	 */
+	while ((read32(&ch[chn].nao.testrpt) & 0x1) == 0) {
+		udelay(1);
+		u4loop_count++;
+
+		if (u4loop_count > MAX_CMP_CPT_WAIT_LOOP) {
+			dramc_dbg("MEASURE_A timeout\n");
+			break;
+		}
+	}
+}
+
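+/*
+ * Run test engine 2: for TE_OP_WRITE_READ_CHECK do a write pass first, then
+ * trigger a read-and-compare pass and return the contents of CMP_ERR
+ * (0 means no compare error was detected).
+ */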
+static u32 dramc_engine2_run(u8 chn, enum dram_te_op wr)
+{
+	u32 result;
+
+	if (wr == TE_OP_READ_CHECK) {
+		clrbits_le32(&ch[chn].ao.test2_4,
+			0x1 << TEST2_4_TESTAUDMODE_SHIFT);
+	} else if (wr == TE_OP_WRITE_READ_CHECK) {
+		clrsetbits_le32(&ch[chn].ao.test2_3,
+			(0x1 << TEST2_3_TEST2R_SHIFT) |
+			(0x1 << TEST2_3_TEST1_SHIFT),
+			0x1 << TEST2_3_TEST2W_SHIFT);
+
+		dramc_engine2_check_complete(chn);
+		clrbits_le32(&ch[chn].ao.test2_3,
+			 (0x1 << TEST2_3_TEST2W_SHIFT) |
+			 (0x1 << TEST2_3_TEST2R_SHIFT) |
+			 (0x1 << TEST2_3_TEST1_SHIFT));
+		udelay(1);
+	}
+
+	/* Do read test */
+	clrsetbits_le32(&ch[chn].ao.test2_3,
+		(0x1 << TEST2_3_TEST2W_SHIFT) | (0x1 << TEST2_3_TEST1_SHIFT),
+		0x1 << TEST2_3_TEST2R_SHIFT);
+
+	dramc_engine2_check_complete(chn);
+
+	udelay(1);
+	result = read32(&ch[chn].nao.cmp_err);
+	clrbits_le32(&ch[chn].ao.test2_3,
+		(0x1 << TEST2_3_TEST2W_SHIFT) |
+		(0x1 << TEST2_3_TEST2R_SHIFT) |
+		(0x1 << TEST2_3_TEST1_SHIFT));
+
+	return result;
+}
+
+static void dramc_engine2_end(u8 chn)
+{
+	clrbits_le32(&ch[chn].ao.test2_4, 0x1 << 17);
+}
+
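+/*
+ * A (coarse, fine) delay setting counts as a pass when neither the rising
+ * nor the falling strobe error bits are set and the gating counter matches
+ * GATING_GOLDEND_DQSCNT. Consecutive passes form the gating window; once
+ * both bytes have a window larger than one coarse step, the search can end
+ * early.
+ */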
+static void find_gating_window(u32 result_r, u32 result_f, u32 *debug_cnt,
+		u8 dly_coarse_large, u8 dly_coarse_0p5t, u8 *pass_begin,
+		u8 *pass_count, u8 *dly_fine_xt, u32 *coarse_tune, u8 *dqs_high)
+{
+	u16 debug_cnt_perbyte;
+
+	for (u8 dqs = 0; dqs < DQS_NUMBER; dqs++) {
+		u8 dqs_result_r = (u8) ((result_r >> (8 * dqs)) & 0xff);
+		u8 dqs_result_f = (u8) ((result_f >> (8 * dqs)) & 0xff);
+
+		debug_cnt_perbyte = (u16) debug_cnt[dqs];
+		if (dqs_result_r != 0 || dqs_result_f != 0 ||
+		    debug_cnt_perbyte != GATING_GOLDEND_DQSCNT)
+			continue;
+
+		if (pass_begin[dqs] == 0) {
+			pass_begin[dqs] = 1;
+			pass_count[dqs] = 0;
+			dramc_dbg("[Byte %d]First pass (%d, %d, %d)\n",
+				  dqs, dly_coarse_large,
+				  dly_coarse_0p5t, *dly_fine_xt);
+		}
+
+		if (pass_begin[dqs] == 1)
+			pass_count[dqs]++;
+
+		if (pass_begin[dqs] == 1 &&
+		    pass_count[dqs] * DQS_GW_FINE_STEP > DQS_GW_FINE_END)
+			dqs_high[dqs] = 0;
+
+		if (pass_count[0] * DQS_GW_FINE_STEP > DQS_GW_FINE_END &&
+		    pass_count[1] * DQS_GW_FINE_STEP > DQS_GW_FINE_END) {
+			dramc_dbg("All bytes' gating windows are > 1 coarse "
+				  "step; stopping the search early\n");
+			*dly_fine_xt = DQS_GW_FINE_END;
+			*coarse_tune = GATING_END;
+		}
+	}
+}
+
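+/*
+ * Track the gating state reported in bits [17:16] of misc_phy_stben_b: a
+ * value of 3 accumulates dqs_high for that byte and, once the byte has been
+ * high long enough, records the current (2T, 0.5T, fine) delays; 1 and 2
+ * count as a transition and 0 resets the tracking for that byte.
+ */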
+static void find_dly_tune(u8 chn, u8 dly_coarse_large, u8 dly_coarse_0p5t,
+		u8 dly_fine_xt, u8 *dqs_high, u8 *dly_coarse_large_cnt,
+		u8 *dly_coarse_0p5t_cnt, u8 *dly_fine_tune_cnt, u8 *dqs_trans)
+{
+	for (size_t dqs = 0; dqs < DQS_NUMBER; dqs++) {
+		u32 dqs_cnt = read32(&ch[chn].phy_nao.misc_phy_stben_b[dqs]);
+		dqs_cnt = (dqs_cnt >> 16) & 3;
+
+		if (dqs_cnt == 3)
+			dqs_high[dqs]++;
+
+		if (dqs_high[dqs] * DQS_GW_FINE_STEP <= 16)
+			continue;
+
+		switch (dqs_cnt) {
+		case 3:
+			dly_coarse_large_cnt[dqs] = dly_coarse_large;
+			dly_coarse_0p5t_cnt[dqs] = dly_coarse_0p5t;
+			dly_fine_tune_cnt[dqs] = dly_fine_xt;
+			dqs_trans[dqs] = 1;
+			break;
+		case 2:
+		case 1:
+			dqs_trans[dqs]++;
+			break;
+		case 0:
+			dqs_high[dqs] = 0;
+			break;
+		}
+	}
+}
+
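+/* Pulse the read-data (RDATRST) and PHY (R_DMPHYRST) reset bits. */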
+static void dram_phy_reset(u8 chn)
+{
+	setbits_le32(&ch[chn].ao.ddrconf0, 1 << DDRCONF0_RDATRST_SHIFT);
+	setbits_le32(&ch[chn].phy.misc_ctrl1, 1 << MISC_CTRL1_R_DMPHYRST_SHIFT);
+	clrbits_le32(&ch[chn].phy.b[0].dq[9], (1 << 4) | (1 << 0));
+	clrbits_le32(&ch[chn].phy.b[1].dq[9], (1 << 4) | (1 << 0));
+
+	udelay(1);
+	setbits_le32(&ch[chn].phy.b[1].dq[9], (1 << 4) | (1 << 0));
+	setbits_le32(&ch[chn].phy.b[0].dq[9], (1 << 4) | (1 << 0));
+	clrbits_le32(&ch[chn].phy.misc_ctrl1, 1 << MISC_CTRL1_R_DMPHYRST_SHIFT);
+	clrbits_le32(&ch[chn].ao.ddrconf0, 1 << DDRCONF0_RDATRST_SHIFT);
+}
+
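+/*
+ * mode 1 sets STBCAL1 bit 5 and raises the per-byte vref field (DQ[6] bits
+ * [15:14]) to 2; mode 0 clears both. In either case the byte DQ[9] bits are
+ * toggled, presumably to re-arm the gating detection logic.
+ */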
+static void dramc_set_gating_mode(u8 chn, bool mode)
+{
+	u8 vref = 0, burst = 0;
+
+	if (mode) {
+		vref = 2;
+		burst = 1;
+	}
+
+	clrsetbits_le32(&ch[chn].ao.stbcal1, 0x1 << 5, burst << 5);
+	setbits_le32(&ch[chn].ao.stbcal, 0x1 << 30);
+
+	for (size_t b = 0; b < 2; b++) {
+		clrsetbits_le32(&ch[chn].phy.b[b].dq[6], 0x3 << 14, vref << 14);
+		setbits_le32(&ch[chn].phy.b[b].dq[9], 0x1 << 5);
+		clrbits_le32(&ch[chn].phy.b[b].dq[9], (0x1 << 4) | (0x1 << 0));
+		setbits_le32(&ch[chn].phy.b[b].dq[9], (0x1 << 4) | (0x1 << 0));
+	}
+}
+
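+/*
+ * Prepare for gating calibration: disable the RX DQS ISI pulse clock gating
+ * and per-bank refresh, turn HW gating tracking off, enable the DQS gating
+ * counter/compare logic, and select which rank the comparator observes.
+ */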
+static void dramc_rx_dqs_gating_cal_pre(u8 chn, u8 rank)
+{
+	rx_dqs_isi_pulse_cg_switch(chn, DISABLE);
+	clrbits_le32(&ch[chn].ao.refctrl0, 1 << REFCTRL0_PBREFEN_SHIFT);
+
+	dramc_hw_gating_onoff(chn, GATING_OFF);
+
+	setbits_le32(&ch[chn].ao.stbcal1, 1 << STBCAL1_STBENCMPEN_SHIFT);
+	setbits_le32(&ch[chn].ao.stbcal1, 1 << STBCAL1_STBCNT_LATCH_EN_SHIFT);
+	clrbits_le32(&ch[chn].ao.ddrconf0, 1 << DDRCONF0_DM4TO1MODE_SHIFT);
+	setbits_le32(&ch[chn].ao.spcmd, 1 << SPCMD_DQSGCNTEN_SHIFT);
+
+	udelay(4);
+	setbits_le32(&ch[chn].ao.spcmd, 1 << SPCMD_DQSGCNTRST_SHIFT);
+	udelay(1);
+	clrbits_le32(&ch[chn].ao.spcmd, 1 << SPCMD_DQSGCNTRST_SHIFT);
+	clrsetbits_le32(&ch[chn].phy.misc_ctrl1,
+		1 << MISC_CTRL1_R_DMSTBENCMP_RK_OPT_SHIFT,
+		rank << MISC_CTRL1_R_DMSTBENCMP_RK_OPT_SHIFT);
+}
+
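+/*
+ * Program the calibrated gating delays: the DQS gate 2T/0.5T fields, the
+ * derived RODT enable timing, and the per-byte fine-tune (DQSIEN) values.
+ */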
+static void dramc_write_dqs_gating_result(u8 chn, u8 rank,
+		u8 *best_coarse_tune2t, u8 *best_coarse_tune0p5t,
+		u8 *best_coarse_tune2t_p1, u8 *best_coarse_tune0p5t_p1,
+		u8 *best_fine_tune)
+{
+	u8 best_coarse_rodt[DQS_NUMBER], best_coarse_0p5t_rodt[DQS_NUMBER];
+	u8 best_coarse_rodt_p1[DQS_NUMBER];
+	u8 best_coarse_0p5t_rodt_p1[DQS_NUMBER];
+
+	rx_dqs_isi_pulse_cg_switch(chn, ENABLE);
+
+	write32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg0,
+		((u32) best_coarse_tune2t[0] <<
+		 SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_SHIFT) |
+		((u32) best_coarse_tune2t[1] <<
+		 SHURK_SELPH_DQSG0_TX_DLY_DQS1_GATED_SHIFT) |
+		((u32) best_coarse_tune2t_p1[0] <<
+		 SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_P1_SHIFT) |
+		((u32) best_coarse_tune2t_p1[1] <<
+		 SHURK_SELPH_DQSG0_TX_DLY_DQS1_GATED_P1_SHIFT));
+	write32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg1,
+		((u32) best_coarse_tune0p5t[0] <<
+		 SHURK_SELPH_DQSG1_REG_DLY_DQS0_GATED_SHIFT) |
+		((u32) best_coarse_tune0p5t[1] <<
+		 SHURK_SELPH_DQSG1_REG_DLY_DQS1_GATED_SHIFT) |
+		((u32) best_coarse_tune0p5t_p1[0] <<
+		 SHURK_SELPH_DQSG1_REG_DLY_DQS0_GATED_P1_SHIFT) |
+		((u32) best_coarse_tune0p5t_p1[1] <<
+		 SHURK_SELPH_DQSG1_REG_DLY_DQS1_GATED_P1_SHIFT));
+
+	for (size_t dqs = 0; dqs < DQS_NUMBER; dqs++) {
+		u8 tmp_value = (best_coarse_tune2t[dqs] << 3)
+			+ best_coarse_tune0p5t[dqs];
+
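+		/*
+		 * Combine the 2T/0.5T fields into one count (8 sub-steps
+		 * per 2T unit), place the read-ODT (RODT) enable 11
+		 * sub-steps earlier than the gate, then split it back.
+		 * If the gate is too early for that, use the defaults.
+		 */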
+		if (tmp_value >= 11) {
+			tmp_value -= 11;
+			best_coarse_rodt[dqs] = tmp_value >> 3;
+			best_coarse_0p5t_rodt[dqs] =
+				tmp_value - (best_coarse_rodt[dqs] << 3);
+
+			tmp_value = (best_coarse_tune2t_p1[dqs] << 3) +
+				best_coarse_tune0p5t_p1[dqs] - 11;
+			best_coarse_rodt_p1[dqs] = tmp_value >> 3;
+			best_coarse_0p5t_rodt_p1[dqs] =
+				tmp_value - (best_coarse_rodt_p1[dqs] << 3);
+
+			dramc_dbg("Best RODT dly(2T, 0.5T) = (%d, %d)\n",
+				  best_coarse_rodt[dqs],
+				  best_coarse_0p5t_rodt[dqs]);
+		} else {
+			best_coarse_rodt[dqs] = 0;
+			best_coarse_0p5t_rodt[dqs] = 0;
+			best_coarse_rodt_p1[dqs] = 4;
+			best_coarse_0p5t_rodt_p1[dqs] = 4;
+			dramc_dbg("RxdqsGatingCal error: best_coarse_tune2t"
+				  " of byte %zd is already 0; RODT cannot be"
+				  " -1 coarse\n", dqs);
+		}
+	}
+
+	write32(&ch[chn].ao.shu[0].rk[rank].selph_odten0,
+		((u32) best_coarse_rodt[0] <<
+		 SHURK_SELPH_ODTEN0_TXDLY_B0_RODTEN_SHIFT) |
+		((u32) best_coarse_rodt[1] <<
+		 SHURK_SELPH_ODTEN0_TXDLY_B1_RODTEN_SHIFT) |
+		((u32) best_coarse_rodt_p1[0] <<
+		 SHURK_SELPH_ODTEN0_TXDLY_B0_RODTEN_P1_SHIFT) |
+		((u32) best_coarse_rodt_p1[1] <<
+		 SHURK_SELPH_ODTEN0_TXDLY_B1_RODTEN_P1_SHIFT));
+	write32(&ch[chn].ao.shu[0].rk[rank].selph_odten1,
+		((u32) best_coarse_0p5t_rodt[0] <<
+		 SHURK_SELPH_ODTEN1_DLY_B0_RODTEN_SHIFT) |
+		((u32) best_coarse_0p5t_rodt[1] <<
+		 SHURK_SELPH_ODTEN1_DLY_B1_RODTEN_SHIFT) |
+		((u32) best_coarse_0p5t_rodt_p1[0] <<
+		 SHURK_SELPH_ODTEN1_DLY_B0_RODTEN_P1_SHIFT) |
+		((u32) best_coarse_0p5t_rodt_p1[1] <<
+		 SHURK_SELPH_ODTEN1_DLY_B1_RODTEN_P1_SHIFT));
+
+	write32(&ch[chn].ao.shu[0].rk[rank].dqsien,
+		best_fine_tune[0] | (best_fine_tune[1] << 8));
+}
+
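+/*
+ * RX DQS gating calibration for one rank: back up the affected registers,
+ * set MR1 bit 7 for the duration of the training, sweep the coarse (2T,
+ * 0.5T) and fine gating delays while running test-engine reads, pick the
+ * middle of each byte's passing window, then restore the registers, clear
+ * MR1 bit 7 and program the result.
+ */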
+static void dramc_rx_dqs_gating_cal(u8 chn, u8 rank)
+{
+	u8 dqs;
+	const u8 mr1_value = 0x56;
+	u8 pass_begin[DQS_NUMBER] = {0}, pass_count[DQS_NUMBER] = {0};
+	u8 min_coarse_tune2t[DQS_NUMBER], min_coarse_tune0p5t[DQS_NUMBER],
+		min_fine_tune[DQS_NUMBER];
+	u8 best_fine_tune[DQS_NUMBER], best_coarse_tune0p5t[DQS_NUMBER],
+		best_coarse_tune2t[DQS_NUMBER];
+	u8 best_coarse_tune0p5t_p1[DQS_NUMBER],
+		best_coarse_tune2t_p1[DQS_NUMBER];
+	u8 dqs_high[DQS_NUMBER] = {0}, dqs_transition[DQS_NUMBER] = {0};
+	u8 dly_coarse_large_cnt[DQS_NUMBER] = {0},
+		dly_coarse_0p5t_cnt[DQS_NUMBER] = {0},
+		dly_fine_tune_cnt[DQS_NUMBER] = {0};
+	u32 coarse_start = GATING_START, coarse_end = GATING_END;
+	u32 debug_cnt[DQS_NUMBER];
+
+	struct reg_value regs_bak[] = {
+		{&ch[chn].ao.stbcal, 0x0},
+		{&ch[chn].ao.stbcal1, 0x0},
+		{&ch[chn].ao.ddrconf0, 0x0},
+		{&ch[chn].ao.spcmd, 0x0},
+		{&ch[chn].ao.refctrl0, 0x0},
+		{&ch[chn].phy.b[0].dq[6], 0x0},
+		{&ch[chn].phy.b[1].dq[6], 0x0},
+	};
+	for (size_t i = 0; i < ARRAY_SIZE(regs_bak); i++)
+		regs_bak[i].value = read32(regs_bak[i].addr);
+
+	dramc_mode_reg_write_by_rank(chn, rank, 0x1, mr1_value | 0x80);
+	dramc_rx_dqs_gating_cal_pre(chn, rank);
+
+	u32 dummy_rd_backup = read32(&ch[chn].ao.dummy_rd);
+	dramc_engine2_init(chn, rank, 0x23, 1);
+
+	dramc_dbg("[Gating]\n");
+	for (u32 coarse_tune = coarse_start; coarse_tune < coarse_end;
+	     coarse_tune += DQS_GW_COARSE_STEP) {
+		u32 dly_coarse_large_rodt = 0, dly_coarse_0p5t_rodt = 0;
+		u32 dly_coarse_large_rodt_p1 = 4, dly_coarse_0p5t_rodt_p1 = 4;
+		u8 dly_coarse_large = coarse_tune / RX_DQS_CTL_LOOP;
+		u8 dly_coarse_0p5t = coarse_tune % RX_DQS_CTL_LOOP;
+		u32 dly_coarse_large_p1 =
+			(coarse_tune + DQS_GW_FREQ_DIV) / RX_DQS_CTL_LOOP;
+		u32 dly_coarse_0p5t_p1 =
+			(coarse_tune + DQS_GW_FREQ_DIV) % RX_DQS_CTL_LOOP;
+		u32 value = (dly_coarse_large << 3) + dly_coarse_0p5t;
+
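+		/*
+		 * Derive the RODT enable timing: 11 sub-steps before the
+		 * gate, mirroring dramc_write_dqs_gating_result().
+		 */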
+		if (value >= 11) {
+			value -= 11;
+			dly_coarse_large_rodt = value >> 3;
+			dly_coarse_0p5t_rodt =
+				value - (dly_coarse_large_rodt << 3);
+
+			value = (dly_coarse_large_p1 << 3) +
+				dly_coarse_0p5t_p1 - 11;
+			dly_coarse_large_rodt_p1 = value >> 3;
+			dly_coarse_0p5t_rodt_p1 =
+				value - (dly_coarse_large_rodt_p1 << 3);
+		}
+
+		write32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg0,
+			((u32) dly_coarse_large <<
+			 SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_SHIFT) |
+			((u32) dly_coarse_large <<
+			 SHURK_SELPH_DQSG0_TX_DLY_DQS1_GATED_SHIFT) |
+			(dly_coarse_large_p1 <<
+			 SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_P1_SHIFT) |
+			(dly_coarse_large_p1 <<
+			 SHURK_SELPH_DQSG0_TX_DLY_DQS1_GATED_P1_SHIFT));
+		write32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg1,
+			((u32) dly_coarse_0p5t <<
+			 SHURK_SELPH_DQSG1_REG_DLY_DQS0_GATED_SHIFT) |
+			((u32) dly_coarse_0p5t <<
+			 SHURK_SELPH_DQSG1_REG_DLY_DQS1_GATED_SHIFT) |
+			(dly_coarse_0p5t_p1 <<
+			 SHURK_SELPH_DQSG1_REG_DLY_DQS0_GATED_P1_SHIFT) |
+			(dly_coarse_0p5t_p1 <<
+			 SHURK_SELPH_DQSG1_REG_DLY_DQS1_GATED_P1_SHIFT));
+		write32(&ch[chn].ao.shu[0].rk[rank].selph_odten0,
+			(dly_coarse_large_rodt <<
+			 SHURK_SELPH_ODTEN0_TXDLY_B0_RODTEN_SHIFT) |
+			(dly_coarse_large_rodt <<
+			 SHURK_SELPH_ODTEN0_TXDLY_B1_RODTEN_SHIFT) |
+			(dly_coarse_large_rodt_p1 <<
+			 SHURK_SELPH_ODTEN0_TXDLY_B0_RODTEN_P1_SHIFT) |
+			(dly_coarse_large_rodt_p1 <<
+			 SHURK_SELPH_ODTEN0_TXDLY_B1_RODTEN_P1_SHIFT));
+		write32(&ch[chn].ao.shu[0].rk[rank].selph_odten1,
+			(dly_coarse_0p5t_rodt <<
+			 SHURK_SELPH_ODTEN1_DLY_B0_RODTEN_SHIFT) |
+			(dly_coarse_0p5t_rodt <<
+			 SHURK_SELPH_ODTEN1_DLY_B1_RODTEN_SHIFT) |
+			(dly_coarse_0p5t_rodt_p1 <<
+			 SHURK_SELPH_ODTEN1_DLY_B0_RODTEN_P1_SHIFT) |
+			(dly_coarse_0p5t_rodt_p1 <<
+			 SHURK_SELPH_ODTEN1_DLY_B1_RODTEN_P1_SHIFT));
+
+		for (u8 dly_fine_xt = DQS_GW_FINE_START;
+			dly_fine_xt < DQS_GW_FINE_END;
+			dly_fine_xt += DQS_GW_FINE_STEP) {
+
+			dramc_set_gating_mode(chn, 0);
+
+			write32(&ch[chn].ao.shu[0].rk[rank].dqsien,
+				dly_fine_xt | (dly_fine_xt << 8));
+
+			dram_phy_reset(chn);
+			setbits_le32(&ch[chn].ao.spcmd,
+				1 << SPCMD_DQSGCNTRST_SHIFT);
+			udelay(1);
+			clrbits_le32(&ch[chn].ao.spcmd,
+				1 << SPCMD_DQSGCNTRST_SHIFT);
+
+			dramc_engine2_run(chn, TE_OP_READ_CHECK);
+
+			u32 result_r = read32(&ch[chn].phy.misc_stberr_rk0_r) &
+				    MISC_STBERR_RK_R_STBERR_RK_R_MASK;
+			u32 result_f = read32(&ch[chn].phy.misc_stberr_rk0_f) &
+				    MISC_STBERR_RK_F_STBERR_RK_F_MASK;
+			debug_cnt[0] = read32(&ch[chn].nao.dqsgnwcnt[0]);
+			debug_cnt[1] = (debug_cnt[0] >> 16) & 0xffff;
+			debug_cnt[0] &= 0xffff;
+
+			dramc_set_gating_mode(chn, 1);
+			dramc_engine2_run(chn, TE_OP_READ_CHECK);
+
+			find_dly_tune(chn, dly_coarse_large, dly_coarse_0p5t,
+				dly_fine_xt, dqs_high, dly_coarse_large_cnt,
+				dly_coarse_0p5t_cnt,
+				dly_fine_tune_cnt, dqs_transition);
+
+			dramc_dbg("%d %d %d |", dly_coarse_large,
+				  dly_coarse_0p5t, dly_fine_xt);
+			for (dqs = 0; dqs < DQS_NUMBER; dqs++)
+				dramc_dbg("%X ", debug_cnt[dqs]);
+
+			dramc_dbg(" |");
+			for (dqs = 0; dqs < DQS_NUMBER; dqs++) {
+				dramc_dbg("(%X %X)",
+				(result_f >> (DQS_BIT_NUMBER * dqs)) & 0xff,
+				(result_r >> (DQS_BIT_NUMBER * dqs)) & 0xff);
+			}
+
+			dramc_dbg("\n");
+			find_gating_window(result_r, result_f, debug_cnt,
+				dly_coarse_large, dly_coarse_0p5t, pass_begin,
+				pass_count, &dly_fine_xt, &coarse_tune,
+				dqs_high);
+		}
+	}
+
+	dramc_engine2_end(chn);
+	write32(&ch[chn].ao.dummy_rd, dummy_rd_backup);
+
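+	/*
+	 * Pick the middle of each byte's detected window: take the recorded
+	 * (2T, 0.5T, fine) starting point, add half of the pass window in
+	 * fine-tune steps and carry any overflow into the 0.5T and 2T
+	 * fields. The "P1" values add DQS_GW_FREQ_DIV on top of that.
+	 */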
+	for (dqs = 0; dqs < DQS_NUMBER; dqs++) {
+		pass_count[dqs] = dqs_transition[dqs];
+		min_fine_tune[dqs] = dly_fine_tune_cnt[dqs];
+		min_coarse_tune0p5t[dqs] = dly_coarse_0p5t_cnt[dqs];
+		min_coarse_tune2t[dqs] = dly_coarse_large_cnt[dqs];
+
+		u8 tmp_offset = pass_count[dqs] * DQS_GW_FINE_STEP / 2;
+		u8 tmp_value = min_fine_tune[dqs] + tmp_offset;
+		best_fine_tune[dqs] = tmp_value % RX_DLY_DQSIENSTB_LOOP;
+
+		tmp_offset = tmp_value / RX_DLY_DQSIENSTB_LOOP;
+		tmp_value = min_coarse_tune0p5t[dqs] + tmp_offset;
+		best_coarse_tune0p5t[dqs] = tmp_value % RX_DQS_CTL_LOOP;
+
+		tmp_offset = tmp_value / RX_DQS_CTL_LOOP;
+		best_coarse_tune2t[dqs] = min_coarse_tune2t[dqs] + tmp_offset;
+		tmp_value = best_coarse_tune0p5t[dqs] + DQS_GW_FREQ_DIV;
+		best_coarse_tune0p5t_p1[dqs] = tmp_value % RX_DQS_CTL_LOOP;
+
+		tmp_offset = tmp_value / RX_DQS_CTL_LOOP;
+		best_coarse_tune2t_p1[dqs] =
+			best_coarse_tune2t[dqs] + tmp_offset;
+	}
+
+	for (dqs = 0; dqs < DQS_NUMBER; dqs++)
+		dramc_show("Best DQS%d dly(2T, 0.5T, fine tune)"
+			   " = (%d, %d, %d)\n", dqs, best_coarse_tune2t[dqs],
+			   best_coarse_tune0p5t[dqs], best_fine_tune[dqs]);
+
+	for (dqs = 0; dqs < DQS_NUMBER; dqs++)
+		dramc_show("Best DQS%d P1 dly(2T, 0.5T, fine tune)"
+			   " = (%d, %d, %d)\n", dqs, best_coarse_tune2t_p1[dqs],
+			   best_coarse_tune0p5t_p1[dqs], best_fine_tune[dqs]);
+
+	for (size_t i = 0; i < ARRAY_SIZE(regs_bak); i++)
+		write32(regs_bak[i].addr, regs_bak[i].value);
+
+	dramc_mode_reg_write_by_rank(chn, rank, 0x1, mr1_value & 0x7f);
+
+	dramc_write_dqs_gating_result(chn, rank, best_coarse_tune2t,
+		best_coarse_tune0p5t, best_coarse_tune2t_p1,
+		best_coarse_tune0p5t_p1, best_fine_tune);
+
+	dram_phy_reset(chn);
+}
+
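+/*
+ * After both ranks are calibrated, normalize the per-rank gating delays so
+ * that the smallest 2T delay equals the expected minimum, and fold the
+ * difference into DQSINCTL/RANKINCTL and the rank-to-rank read timing.
+ */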
+static void dramc_rx_dqs_gating_post_process(u8 chn)
+{
+	u8 dqs, rank_rx_dvs, dqsinctl;
+	u32 read_dqsinctl, rankinctl_root, xrtr2r, reg_tx_dly_dqsgated_min = 3;
+	u8 txdly_cal_min = 0xff, txdly_cal_max = 0, tx_dly_dqs_gated = 0;
+	u32 best_coarse_tune2t[RANK_MAX][DQS_NUMBER];
+	u32 best_coarse_tune2t_p1[RANK_MAX][DQS_NUMBER];
+
+	rank_rx_dvs = reg_tx_dly_dqsgated_min - 1;
+
+	for (size_t b = 0; b < 2; b++)
+		clrsetbits_le32(&ch[chn].phy.shu[0].b[b].dq[7],
+			SHU1_BX_DQ7_R_DMRANKRXDVS_MASK,
+			rank_rx_dvs << SHU1_BX_DQ7_R_DMRANKRXDVS_SHIFT);
+
+	for (size_t rank = 0; rank < RANK_MAX; rank++) {
+		u32 dqsg0 = read32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg0);
+		for (dqs = 0; dqs < DQS_NUMBER; dqs++) {
+			best_coarse_tune2t[rank][dqs] =
+			   (dqsg0 >> (dqs * 8)) &
+			    SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_MASK;
+			best_coarse_tune2t_p1[rank][dqs] =
+			    ((dqsg0 >> (dqs * 8)) &
+			     SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_P1_MASK) >>
+			    SHURK_SELPH_DQSG0_TX_DLY_DQS0_GATED_P1_SHIFT;
+			dramc_dbg("Rank%zd best DQS%d dly(2T,(P1)2T)=(%d, %d)\n",
+			      rank, dqs, best_coarse_tune2t[rank][dqs],
+			      best_coarse_tune2t_p1[rank][dqs]);
+
+			tx_dly_dqs_gated = best_coarse_tune2t[rank][dqs];
+			txdly_cal_min = MIN(txdly_cal_min, tx_dly_dqs_gated);
+
+			tx_dly_dqs_gated = best_coarse_tune2t_p1[rank][dqs];
+			txdly_cal_max = MAX(txdly_cal_max, tx_dly_dqs_gated);
+		}
+	}
+
+	dqsinctl = reg_tx_dly_dqsgated_min - txdly_cal_min;
+	dramc_dbg("Dqsinctl:%d, tx_dly_dqsgated_min %d, txdly_cal_min %d\n",
+		  dqsinctl, reg_tx_dly_dqsgated_min, txdly_cal_min);
+
+	if (dqsinctl != 0) {
+		txdly_cal_min += dqsinctl;
+		txdly_cal_max += dqsinctl;
+
+		for (size_t rank = 0; rank < RANK_MAX; rank++) {
+			dramc_dbg("Rank: %zd\n", rank);
+			for (dqs = 0; dqs < DQS_NUMBER; dqs++) {
+				best_coarse_tune2t[rank][dqs] += dqsinctl;
+				best_coarse_tune2t_p1[rank][dqs] += dqsinctl;
+
+				dramc_dbg("Best DQS%d dly(2T) = (%d)\n",
+					  dqs, best_coarse_tune2t[rank][dqs]);
+				dramc_dbg("Best DQS%d P1 dly(2T) = (%d)\n",
+					  dqs,
+					  best_coarse_tune2t_p1[rank][dqs]);
+			}
+
+			write32(&ch[chn].ao.shu[0].rk[rank].selph_dqsg0,
+				(best_coarse_tune2t[rank][0] << 0) |
+				(best_coarse_tune2t[rank][1] << 8) |
+				(best_coarse_tune2t_p1[rank][0] << 4) |
+				(best_coarse_tune2t_p1[rank][1] << 12));
+		}
+	}
+
+	read_dqsinctl = (read32(&ch[chn].ao.shu[0].rk[0].dqsctl) &
+				SHURK_DQSCTL_DQSINCTL_MASK) - dqsinctl;
+	rankinctl_root = (read_dqsinctl >= 3) ? (read_dqsinctl - 3) : 0;
+
+	clrsetbits_le32(&ch[chn].ao.shu[0].rk[0].dqsctl,
+		SHURK_DQSCTL_DQSINCTL_MASK,
+		read_dqsinctl << SHURK_DQSCTL_DQSINCTL_SHIFT);
+	clrsetbits_le32(&ch[chn].ao.shu[0].rk[1].dqsctl,
+		SHURK_DQSCTL_DQSINCTL_MASK,
+		read_dqsinctl << SHURK_DQSCTL_DQSINCTL_SHIFT);
+	clrsetbits_le32(&ch[chn].ao.shu[0].rankctl,
+		SHU_RANKCTL_RANKINCTL_PHY_MASK |
+		SHU_RANKCTL_RANKINCTL_MASK | SHU_RANKCTL_RANKINCTL_ROOT1_MASK,
+		(read_dqsinctl << SHU_RANKCTL_RANKINCTL_PHY_SHIFT) |
+		(rankinctl_root << SHU_RANKCTL_RANKINCTL_SHIFT) |
+		(rankinctl_root << SHU_RANKCTL_RANKINCTL_ROOT1_SHIFT));
+
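+	/*
+	 * XRTR2R (rank-to-rank read gap) must cover the largest gated delay
+	 * (8 + txdly_cal_max + 1), capped at 12.
+	 */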
+	xrtr2r = MIN(8 + txdly_cal_max + 1, 12);
+	clrsetbits_le32(&ch[chn].ao.shu[0].actim_xrt,
+		SHU_ACTIM_XRT_XRTR2R_MASK,
+		xrtr2r << SHU_ACTIM_XRT_XRTR2R_SHIFT);
+
+	dramc_dbg("Tx_dly_DQS gated check: min %d max %d, changeDQSINCTL=%d,"
+		  " DQSINCTL=%d, RANKINCTL=%d, XRTR2R=%d\n",
+		  txdly_cal_min, txdly_cal_max, dqsinctl,
+		  read_dqsinctl, rankinctl_root, xrtr2r);
+}
+
 void dramc_calibrate_all_channels(const struct sdram_params *pams)
 {
 	for (u8 chn = 0; chn < CHANNEL_MAX; chn++) {
@@ -296,6 +937,9 @@
 			cmd_bus_training(chn, rk, pams);
 			dramc_write_leveling(chn, rk, pams->wr_level);
 			auto_refresh_switch(chn, 1);
+			dramc_rx_dqs_gating_cal(chn, rk);
 		}
+
+		dramc_rx_dqs_gating_post_process(chn);
 	}
 }
diff --git a/src/soc/mediatek/mt8183/include/soc/dramc_pi_api.h b/src/soc/mediatek/mt8183/include/soc/dramc_pi_api.h
index 6fc3ef7..3fb8c25 100644
--- a/src/soc/mediatek/mt8183/include/soc/dramc_pi_api.h
+++ b/src/soc/mediatek/mt8183/include/soc/dramc_pi_api.h
@@ -111,6 +111,11 @@
 	RESTORE_VALUE
 };
 
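+/* Address/value pair for saving/restoring registers around calibration. */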
+struct reg_value {
+	u32 *addr;
+	u32 value;
+};
+
 enum {
 	DQ_DIV_SHIFT = 3,
 	DQ_DIV_MASK = BIT(DQ_DIV_SHIFT) - 1,