blob: 8c8e66a807c031388eff3d318a1fd1a646408c1f [file] [log] [blame]
/*
* This file is part of the coreboot project.
*
* Copyright (C) 2012 secunet Security Networks AG
* (Written by Nico Huber <nico.huber@secunet.com> for secunet)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include <arch/io.h>
#include <pc80/mc146818rtc.h>
#include <console/console.h>
#include "gm45.h"
/* A small list of rank base addresses used as test targets. */
typedef struct {
	/* Physical base address of each populated rank under test. */
	u32 addr[RANKS_PER_CHANNEL];
	/* Number of valid entries in addr[]. */
	unsigned count;
} address_bunch_t;
/* Read Training. */
/* Per-channel, per-byte-lane read timing register: byte lane 0 sits at
   the highest offset, lane 7 at the lowest ((7 - bl) * 4). */
#define CxRDTy_MCHBAR(ch, bl) (0x14b0 + (ch * 0x0100) + ((7 - bl) * 4))
/* T: coarse read delay step (4-bit field). */
#define CxRDTy_T_SHIFT 20
#define CxRDTy_T_MASK (0xf << CxRDTy_T_SHIFT)
#define CxRDTy_T(t) ((t << CxRDTy_T_SHIFT) & CxRDTy_T_MASK)
/* P: fine read delay / phase step (3-bit field). */
#define CxRDTy_P_SHIFT 16
#define CxRDTy_P_MASK (0x7 << CxRDTy_P_SHIFT)
#define CxRDTy_P(p) ((p << CxRDTy_P_SHIFT) & CxRDTy_P_MASK)
/* Test pattern for read training.  read_training_test() checks one dword
   out of every 8-byte burst against this table (320 bytes per rank). */
static const u32 read_training_schedule[] = {
	0xfefefefe, 0x7f7f7f7f, 0xbebebebe, 0xdfdfdfdf,
	0xeeeeeeee, 0xf7f7f7f7, 0xfafafafa, 0xfdfdfdfd,
	0x00000000, 0x81818181, 0x40404040, 0x21212121,
	0x10101010, 0x09090909, 0x04040404, 0x03030303,
	0x10101010, 0x11111111, 0xeeeeeeee, 0xefefefef,
	0x10101010, 0x11111111, 0xeeeeeeee, 0xefefefef,
	0x10101010, 0xefefefef, 0x10101010, 0xefefefef,
	0x10101010, 0xefefefef, 0x10101010, 0xefefefef,
	0x00000000, 0xffffffff, 0x00000000, 0xffffffff,
	0x00000000, 0xffffffff, 0x00000000, 0x00000000,
};
/* Eight fine (p) steps per coarse (t) step. */
#define READ_TIMING_P_SHIFT 3
#define READ_TIMING_P_BOUND (1 << READ_TIMING_P_SHIFT)
/* Exclusive upper bound for the coarse delay t. */
#define READ_TIMING_T_BOUND 14
/* One read timing: coarse tap t plus fine phase p, p in [0, 8). */
typedef struct {
	int t;
	int p;
} read_timing_t;
/*
 * Canonicalize a read timing so that p lies in [0, READ_TIMING_P_BOUND),
 * carrying any phase overflow/underflow into whole t steps.  Dies if the
 * resulting t no longer fits the hardware field.
 */
static void normalize_read_timing(read_timing_t *const timing)
{
	for (; timing->p >= READ_TIMING_P_BOUND; timing->p -= READ_TIMING_P_BOUND)
		++timing->t;
	for (; timing->p < 0; timing->p += READ_TIMING_P_BOUND)
		--timing->t;

	if (timing->t < 0 || timing->t >= READ_TIMING_T_BOUND)
		die("Timing under-/overflow during read training.\n");
}
/*
 * Normalize the given timing and program its T/P fields into the
 * per-lane read timing register of the given channel.
 */
static void program_read_timing(const int ch, const int lane,
				read_timing_t *const timing)
{
	normalize_read_timing(timing);

	const u32 mchbar_offset = CxRDTy_MCHBAR(ch, lane);
	u32 value = MCHBAR32(mchbar_offset);
	value &= ~(CxRDTy_T_MASK | CxRDTy_P_MASK);
	value |= CxRDTy_T(timing->t) | CxRDTy_P(timing->p);
	MCHBAR32(mchbar_offset) = value;
}
/*
 * Verify the read-training pattern for one byte lane.
 *
 * Byte lanes 0-3 are carried in the low dword of each 8-byte burst
 * (lane_offset 0), lanes 4-7 in the high dword (lane_offset 4).
 *
 * Returns 1 on success, 0 on failure.
 */
static int read_training_test(const int channel, const int lane,
			      const address_bunch_t *const addresses)
{
	unsigned int i;

	const int lane_offset = lane & 4; /* dword select within burst */
	/* Unsigned constant: 0xff << 24 (lanes 3 and 7) would shift a
	   signed int past bit 30, which is undefined behavior. */
	const u32 lane_mask = 0xffu << ((lane & ~4) << 3);

	for (i = 0; i < addresses->count; ++i) {
		unsigned int offset;
		for (offset = lane_offset; offset < 320; offset += 8) {
			const u32 read = read32(addresses->addr[i] + offset);
			const u32 good = read_training_schedule[offset >> 3];
			if ((read & lane_mask) != (good & lane_mask))
				return 0;
		}
	}
	return 1;
}
/*
 * Train the read timing of one byte lane: find the lower and upper edges
 * of the working (t, p) window, then program the midpoint.
 */
static void read_training_per_lane(const int channel, const int lane,
				   const address_bunch_t *const addresses)
{
	read_timing_t lower, upper;

	/* NOTE(review): sets bits 25/26 of the per-lane register before
	   training — presumably enables the timing override; not visible
	   from this file, confirm against chipset docs. */
	MCHBAR32(CxRDTy_MCHBAR(channel, lane)) |= 3 << 25;

	/* Search lower bound. */
	lower.t = 0;
	lower.p = 0;
	program_read_timing(channel, lane, &lower);
	/* Coarse search for good t. */
	while (!read_training_test(channel, lane, addresses)) {
		++lower.t;
		program_read_timing(channel, lane, &lower);
	}
	/* Step back, then fine search for good p. */
	if (lower.t > 0) {
		--lower.t;
		program_read_timing(channel, lane, &lower);
		while (!read_training_test(channel, lane, addresses)) {
			++lower.p;
			program_read_timing(channel, lane, &lower);
		}
	}

	/* Search upper bound, starting one coarse step above the lower edge. */
	upper.t = lower.t + 1;
	upper.p = lower.p;
	program_read_timing(channel, lane, &upper);
	if (!read_training_test(channel, lane, addresses))
		die("Read training failed: limits too narrow.\n");
	/* Coarse search for bad t. */
	do {
		++upper.t;
		program_read_timing(channel, lane, &upper);
	} while (read_training_test(channel, lane, addresses));
	/* Fine search for bad p. */
	--upper.t;
	program_read_timing(channel, lane, &upper);
	while (read_training_test(channel, lane, addresses)) {
		++upper.p;
		program_read_timing(channel, lane, &upper);
	}

	/* Calculate and program mean value: flatten both bounds to a pure
	   phase count (t * 8 + p), average, and split back into t.p. */
	lower.p += lower.t << READ_TIMING_P_SHIFT;
	upper.p += upper.t << READ_TIMING_P_SHIFT;
	const int mean_p = (lower.p + upper.p) >> 1;
	/* lower becomes the mean value. */
	lower.t = mean_p >> READ_TIMING_P_SHIFT;
	lower.p = mean_p & (READ_TIMING_P_BOUND - 1);
	program_read_timing(channel, lane, &lower);
	printk(BIOS_DEBUG, "Final timings for byte lane %d on channel %d: "
			   "%d.%d\n", lane, channel, lower.t, lower.p);
}
/*
 * Run read training on every populated channel: write the test schedule
 * into all populated ranks, then train each of the eight byte lanes.
 */
static void perform_read_training(const dimminfo_t *const dimms)
{
	int ch, r, lane;

	FOR_EACH_POPULATED_CHANNEL(dimms, ch) {
		/* Collect the base address of every populated rank. */
		address_bunch_t addresses = { { 0, }, 0 };
		FOR_EACH_POPULATED_RANK_IN_CHANNEL(dimms, ch, r)
			addresses.addr[addresses.count++] =
				raminit_get_rank_addr(ch, r);

		/* Seed each rank with the expected pattern. */
		unsigned int idx;
		for (idx = 0; idx < addresses.count; ++idx) {
			unsigned int off;
			for (off = 0; off < 320; off += 4)
				write32(addresses.addr[idx] + off,
					read_training_schedule[off >> 3]);
		}

		for (lane = 0; lane < 8; ++lane)
			read_training_per_lane(ch, lane, &addresses);
	}
}
/*
 * Save the trained read timings to CMOS for later S3 restore.
 * Each lane's timing pair is packed into one byte: T in the high
 * nibble, P in the low nibble.
 */
static void read_training_store_results(void)
{
	u8 bytes[TOTAL_CHANNELS * 8];
	int ch, i;

	FOR_EACH_CHANNEL(ch) {
		for (i = 0; i < 8; ++i) {
			const u32 reg = MCHBAR32(CxRDTy_MCHBAR(ch, i));
			const u8 t = (reg & CxRDTy_T_MASK) >> CxRDTy_T_SHIFT;
			const u8 p = (reg & CxRDTy_P_MASK) >> CxRDTy_P_SHIFT;
			bytes[(ch * 8) + i] = (t << 4) | p;
		}
	}

	/* Store everything in CMOS above 128 bytes. */
	for (i = 0; i < (TOTAL_CHANNELS * 8); ++i)
		cmos_write(bytes[i], CMOS_READ_TRAINING + i);
}
/*
 * Restore read timings previously saved to CMOS (S3 resume path).
 * Mirrors read_training_store_results(): T in the high nibble,
 * P in the low three bits of each stored byte.
 */
static void read_training_restore_results(void)
{
	u8 bytes[TOTAL_CHANNELS * 8];
	int ch, i;

	/* Fetch the stored bytes from CMOS. */
	for (i = 0; i < (TOTAL_CHANNELS * 8); ++i)
		bytes[i] = cmos_read(CMOS_READ_TRAINING + i);

	/* Program them back, lane by lane. */
	FOR_EACH_CHANNEL(ch) {
		for (i = 0; i < 8; ++i) {
			const u8 stored = bytes[(ch * 8) + i];
			const int t = stored >> 4;
			const int p = stored & 7;

			u32 reg = MCHBAR32(CxRDTy_MCHBAR(ch, i));
			reg &= ~(CxRDTy_T_MASK | CxRDTy_P_MASK);
			reg |= (3 << 25) | CxRDTy_T(t) | CxRDTy_P(p);
			MCHBAR32(CxRDTy_MCHBAR(ch, i)) = reg;
			printk(BIOS_DEBUG, "Restored timings for byte lane "
				"%d on channel %d: %d.%d\n", i, ch, t, p);
		}
	}
}
/*
 * Read-training entry point: train and save on a normal boot,
 * restore the saved results on S3 resume.
 */
void raminit_read_training(const dimminfo_t *const dimms, const int s3resume)
{
	if (s3resume) {
		read_training_restore_results();
	} else {
		perform_read_training(dimms);
		read_training_store_results();
	}
	raminit_reset_readwrite_pointers();
}
/* Write Training. */
/* All constants and macro operands below are unsigned: the original
   0xf << 28 (and t << 28) left-shifted a signed int into the sign bit,
   which is undefined behavior in C.  Masking the argument before the
   shift produces the same field value without the UB. */
#define CxWRTy_T_SHIFT		28
#define CxWRTy_T_MASK		(0xfu << CxWRTy_T_SHIFT)
#define CxWRTy_T(t)		(((t) & 0xfu) << CxWRTy_T_SHIFT)
#define CxWRTy_P_SHIFT		24
#define CxWRTy_P_MASK		(0x7u << CxWRTy_P_SHIFT)
#define CxWRTy_P(p)		(((p) & 0x7u) << CxWRTy_P_SHIFT)
#define CxWRTy_F_SHIFT		18
#define CxWRTy_F_MASK		(0x3u << CxWRTy_F_SHIFT)
#define CxWRTy_F(f)		(((f) & 0x3u) << CxWRTy_F_SHIFT)
#define CxWRTy_D_SHIFT		16
#define CxWRTy_D_MASK		(0x3u << CxWRTy_D_SHIFT)
#define CxWRTy_BELOW_D		(0x3u << CxWRTy_D_SHIFT)
#define CxWRTy_ABOVE_D		(0x1u << CxWRTy_D_SHIFT)
/* Test pattern for write training: 640 bytes, written and verified in
   8-byte bursts (both dwords of a burst carry the same table entry). */
static const u32 write_training_schedule[] = {
	0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
	0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
	0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
	0xffffffff, 0x00000000, 0xffffffff, 0x00000000,
	0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
	0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
	0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
	0xefefefef, 0x10101010, 0xefefefef, 0x10101010,
	0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
	0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
	0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
	0xefefefef, 0xeeeeeeee, 0x11111111, 0x10101010,
	0x03030303, 0x04040404, 0x09090909, 0x10101010,
	0x21212121, 0x40404040, 0x81818181, 0x00000000,
	0x03030303, 0x04040404, 0x09090909, 0x10101010,
	0x21212121, 0x40404040, 0x81818181, 0x00000000,
	0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee,
	0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe,
	0xfdfdfdfd, 0xfafafafa, 0xf7f7f7f7, 0xeeeeeeee,
	0xdfdfdfdf, 0xbebebebe, 0x7f7f7f7f, 0xfefefefe,
};
/* Byte-lane masks for raw card types A, B and C, indexed as
   [MEM_CLOCK_1067MT?][group][lower/upper dword of the burst].
   Groups whose pair of masks is all-zero are skipped in training. */
static const u32 write_training_bytelane_masks_abc[2][4][2] = {
	{ /* clock < MEM_CLOCK_1067MT */
		{ 0xffffffff, 0x00000000 }, { 0x00000000, 0x00000000 },
		{ 0x00000000, 0xffffffff }, { 0x00000000, 0x00000000 },
	},
	{ /* clock == MEM_CLOCK_1067MT */
		{ 0x0000ffff, 0x00000000 }, { 0xffff0000, 0x00000000 },
		{ 0x00000000, 0x0000ffff }, { 0x00000000, 0xffff0000 },
	},
};
/* Byte-lane masks for raw card type F, indexed as
   [group][lower/upper dword of the burst]. */
static const u32 write_training_bytelane_masks_f[4][2] = {
	{ 0xff00ff00, 0x00000000 }, { 0x00ff00ff, 0x00000000 },
	{ 0x00000000, 0xff00ff00 }, { 0x00000000, 0x00ff00ff },
};
/* Eight fine (p) steps per coarse (t) step. */
#define WRITE_TIMING_P_SHIFT 3
#define WRITE_TIMING_P_BOUND (1 << WRITE_TIMING_P_SHIFT)
/* Exclusive upper bound for the coarsest step f. */
#define WRITE_TIMING_F_BOUND 4
/* One write timing: f.t.p, where t overflows into f (see
   normalize_write_timing()) and p overflows into t. */
typedef struct {
	int f;			/* coarsest step */
	int t;			/* coarse step, in [0, t_bound) */
	const int t_bound;	/* t steps per f step (11 or 12, clock-dependent) */
	int p;			/* fine phase, in [0, 8) */
} write_timing_t;
/*
 * Canonicalize a write timing: carry phase overflow/underflow into whole
 * t steps, then t overflow/underflow into whole f steps.  Dies if the
 * resulting f no longer fits the hardware field.
 */
static void normalize_write_timing(write_timing_t *const timing)
{
	for (; timing->p >= WRITE_TIMING_P_BOUND; timing->p -= WRITE_TIMING_P_BOUND)
		++timing->t;
	for (; timing->p < 0; timing->p += WRITE_TIMING_P_BOUND)
		--timing->t;

	for (; timing->t >= timing->t_bound; timing->t -= timing->t_bound)
		++timing->f;
	for (; timing->t < 0; timing->t += timing->t_bound)
		--timing->f;

	if (timing->f < 0 || timing->f >= WRITE_TIMING_F_BOUND)
		die("Timing under-/overflow during write training.\n");
}
/*
 * Normalize the given timing and program its F/T/P fields plus the
 * derived D field into the per-group write timing register.
 */
static void program_write_timing(const int ch, const int group,
				 write_timing_t *const timing, int memclk1067)
{
	/* t thresholds for the D field, indexed
	   [MEM_CLOCK_1067MT?][lower/upper]: t at or below the lower
	   threshold selects BELOW_D, above the upper selects ABOVE_D. */
	const u32 d_bounds[2][2] = { { 1, 6 }, { 2, 9 } };

	normalize_write_timing(timing);

	const int f = timing->f;
	const int t = timing->t;
	/* At 1067MT, clamp p to 4 within half a t step around the
	   t == 9/10 boundary. */
	const int p = (memclk1067 && (((t == 9) && (timing->p >= 4)) ||
				      ((t == 10) && (timing->p < 4))))
		? 4 : timing->p;
	const int d =
		(t <= d_bounds[memclk1067][0]) ? CxWRTy_BELOW_D :
		((t > d_bounds[memclk1067][1]) ? CxWRTy_ABOVE_D : 0);

	u32 reg = MCHBAR32(CxWRTy_MCHBAR(ch, group));
	reg &= ~(CxWRTy_T_MASK | CxWRTy_P_MASK | CxWRTy_F_MASK);
	reg &= ~CxWRTy_D_MASK;
	reg |= CxWRTy_T(t) | CxWRTy_P(p) | CxWRTy_F(f) | d;
	MCHBAR32(CxWRTy_MCHBAR(ch, group)) = reg;
}
/*
 * Run one pass of the write-training test: write the 640-byte schedule
 * to every address in the bunch, flush, and read it back through the
 * given lower/upper dword masks.
 *
 * NOTE(review): MCHBAR 0x0220/0x0218 are saved, tweaked for the test,
 * and restored on every exit path; their exact semantics are not
 * visible in this file.
 *
 * Returns 1 on success, 0 on failure.
 */
static int write_training_test(const address_bunch_t *const addresses,
			       const u32 *const masks)
{
	int ret = 0;
	unsigned int i;

	const u32 mmarb0 = MCHBAR32(0x0220);
	const u8 wrcctl = MCHBAR8(0x0218);
	/* Unsigned constant: 0xf << 28 would left-shift a signed int
	   into the sign bit, which is undefined behavior. */
	MCHBAR32(0x0220) |= 0xfu << 28;
	MCHBAR8(0x0218) |= 0x1 << 4;

	for (i = 0; i < addresses->count; ++i) {
		const unsigned int addr = addresses->addr[i];
		unsigned int off;

		for (off = 0; off < 640; off += 8) {
			const u32 pattern = write_training_schedule[off >> 3];
			write32(addr + off, pattern);
			write32(addr + off + 4, pattern);
		}

		/* NOTE(review): presumably flushes the posted writes —
		   confirm what MCHBAR 0x78 bit 0 does. */
		MCHBAR8(0x78) |= 1;

		for (off = 0; off < 640; off += 8) {
			const u32 good = write_training_schedule[off >> 3];
			const u32 read1 = read32(addr + off);
			if ((read1 & masks[0]) != (good & masks[0]))
				goto _bad_timing_out;
			const u32 read2 = read32(addr + off + 4);
			if ((read2 & masks[1]) != (good & masks[1]))
				goto _bad_timing_out;
		}
	}
	ret = 1;
_bad_timing_out:
	/* Restore the saved register state on both exit paths. */
	MCHBAR32(0x0220) = mmarb0;
	MCHBAR8(0x0218) = wrcctl;
	return ret;
}
/*
 * Train the write timing of one byte-lane group: find the lower and
 * upper edges of the working (f, t, p) window, then program the
 * midpoint.
 */
static void write_training_per_group(const int ch, const int group,
				     const address_bunch_t *const addresses,
				     const u32 masks[][2], const int memclk1067)
{
	const int t_bound = memclk1067 ? 12 : 11;
	write_timing_t lower = { 0, 0, t_bound, 0 },
		       upper = { 0, 0, t_bound, 0 };

	/* Search lower bound, seeded from the current register contents.
	   NOTE(review): the seed is read from bit positions 12/8/2,
	   different from the 28/24/18 fields program_write_timing()
	   writes — presumably a second view of the same timings; confirm
	   against chipset docs. */
	const u32 reg = MCHBAR32(CxWRTy_MCHBAR(ch, group));
	lower.t = (reg >> 12) & 0xf;
	lower.p = (reg >> 8) & 0x7;
	lower.f = ((reg >> 2) & 0x3) - 1;
	program_write_timing(ch, group, &lower, memclk1067);
	/* Coarse search for good t. */
	while (!write_training_test(addresses, masks[group])) {
		++lower.t;
		program_write_timing(ch, group, &lower, memclk1067);
	}
	/* Fine search for good p. */
	--lower.t;
	program_write_timing(ch, group, &lower, memclk1067);
	while (!write_training_test(addresses, masks[group])) {
		++lower.p;
		program_write_timing(ch, group, &lower, memclk1067);
	}
	/* Search upper bound, starting three coarse steps up. */
	upper.t = lower.t + 3;
	upper.p = lower.p;
	upper.f = lower.f;
	program_write_timing(ch, group, &upper, memclk1067);
	if (!write_training_test(addresses, masks[group]))
		die("Write training failed; limits too narrow.\n");
	/* Coarse search for bad t (first failing step above the window). */
	while (write_training_test(addresses, masks[group])) {
		++upper.t;
		program_write_timing(ch, group, &upper, memclk1067);
	}
	/* Fine search for bad p. */
	--upper.t;
	program_write_timing(ch, group, &upper, memclk1067);
	while (write_training_test(addresses, masks[group])) {
		++upper.p;
		program_write_timing(ch, group, &upper, memclk1067);
	}
	/* Calculate and program mean value: flatten both bounds to a
	   single phase count (((f * t_bound) + t) * 8 + p), average,
	   then split back into f.t.p. */
	lower.t += lower.f * lower.t_bound;
	lower.p += lower.t << WRITE_TIMING_P_SHIFT;
	upper.t += upper.f * upper.t_bound;
	upper.p += upper.t << WRITE_TIMING_P_SHIFT;
	/* lower becomes the mean value. */
	const int mean_p = (lower.p + upper.p) >> 1;
	lower.f = mean_p / (lower.t_bound << WRITE_TIMING_P_SHIFT);
	lower.t = (mean_p >> WRITE_TIMING_P_SHIFT) % lower.t_bound;
	lower.p = mean_p & (WRITE_TIMING_P_BOUND - 1);
	program_write_timing(ch, group, &lower, memclk1067);
	printk(BIOS_DEBUG, "Final timings for group %d"
			   " on channel %d: %d.%d.%d\n",
	       group, ch, lower.f, lower.t, lower.p);
}
/*
 * Run write training on all populated channels.  Raw card type F uses a
 * different byte-lane mask table than types A/B/C.
 */
static void perform_write_training(const int memclk1067,
				   const dimminfo_t *const dimms)
{
	const int cardF[] = { dimms[0].card_type == 0xf,
			      dimms[1].card_type == 0xf };
	int ch, r, group;
	address_bunch_t addr[2] = { { { 0, }, 0 }, { { 0, }, 0 }, };

	/* Add check if channel A is populated, i.e. if cardF[0] is valid.
	 * Otherwise we would write channel A registers when DIMM in channel B
	 * is of raw card type A, B or C (cardF[1] == 0) even if channel A is
	 * not populated.
	 * Needs raw card type A, B or C for testing. */
	if ((dimms[0].card_type != 0) && (cardF[0] == cardF[1])) {
		/* Common path for both channels. */
		FOR_EACH_POPULATED_RANK(dimms, ch, r)
			addr[0].addr[addr[0].count++] =
				raminit_get_rank_addr(ch, r);
	} else {
		/* Separate address lists per channel. */
		FOR_EACH_POPULATED_RANK(dimms, ch, r)
			addr[ch].addr[addr[ch].count++] =
				raminit_get_rank_addr(ch, r);
	}
	FOR_EACH_CHANNEL(ch) if (addr[ch].count > 0) {
		const u32 (*const masks)[2] = (!cardF[ch])
			? write_training_bytelane_masks_abc[memclk1067]
			: write_training_bytelane_masks_f;
		for (group = 0; group < 4; ++group) {
			/* Skip groups with no byte lanes to test. */
			if (!masks[group][0] && !masks[group][1])
				continue;
			write_training_per_group(
				ch, group, &addr[ch], masks, memclk1067);
		}
	}
}
/*
 * Save the trained write timings to CMOS for later S3 restore.
 * Each group takes two bytes: the first packs T (high nibble) and
 * P (low nibble), the second holds F.
 */
static void write_training_store_results(void)
{
	u8 bytes[TOTAL_CHANNELS * 4 * 2]; /* two bytes per group */
	int ch, i;

	/* We could save six bytes by putting all F values in two bytes. */
	FOR_EACH_CHANNEL(ch) {
		for (i = 0; i < 4; ++i) {
			const u32 reg = MCHBAR32(CxWRTy_MCHBAR(ch, i));
			const u8 t = (reg & CxWRTy_T_MASK) >> CxWRTy_T_SHIFT;
			const u8 p = (reg & CxWRTy_P_MASK) >> CxWRTy_P_SHIFT;
			const u8 f = (reg & CxWRTy_F_MASK) >> CxWRTy_F_SHIFT;
			bytes[(ch * 8) + (i * 2)] = (t << 4) | p;
			bytes[(ch * 8) + (i * 2) + 1] = f;
		}
	}

	/* Store everything in CMOS above 128 bytes. */
	for (i = 0; i < (TOTAL_CHANNELS * 4 * 2); ++i)
		cmos_write(bytes[i], CMOS_WRITE_TRAINING + i);
}
/*
 * Restore write timings previously saved to CMOS (S3 resume path),
 * reprogramming them through the regular program_write_timing() path.
 */
static void write_training_restore_results(const int memclk1067)
{
	const int t_bound = memclk1067 ? 12 : 11;
	u8 bytes[TOTAL_CHANNELS * 4 * 2]; /* two bytes per group */
	int ch, i;

	/* Fetch the stored bytes from CMOS. */
	for (i = 0; i < (TOTAL_CHANNELS * 4 * 2); ++i)
		bytes[i] = cmos_read(CMOS_WRITE_TRAINING + i);

	FOR_EACH_CHANNEL(ch) {
		for (i = 0; i < 4; ++i) {
			const u8 tp = bytes[(ch * 8) + (i * 2)];
			write_timing_t timing = { 0, 0, t_bound, 0 };

			timing.f = bytes[(ch * 8) + (i * 2) + 1] & 3;
			timing.t = tp >> 4;
			timing.p = tp & 7;
			program_write_timing(ch, i, &timing, memclk1067);
			printk(BIOS_DEBUG, "Restored timings for group %d "
				"on channel %d: %d.%d.%d\n",
				i, ch, timing.f, timing.t, timing.p);
		}
	}
}
/*
 * Write-training entry point: train and save on a normal boot,
 * restore the saved results on S3 resume.
 */
void raminit_write_training(const mem_clock_t ddr3clock,
			    const dimminfo_t *const dimms,
			    const int s3resume)
{
	const int memclk1067 = (ddr3clock == MEM_CLOCK_1067MT);

	if (s3resume) {
		write_training_restore_results(memclk1067);
	} else {
		perform_write_training(memclk1067, dimms);
		write_training_store_results();
	}
	raminit_reset_readwrite_pointers();
}