
[Need HELP!] Please help to test kernel patch (sparc64, NUMA)



Hi,

I'm looking for someone who has a sparc64 machine with NUMA. The patch below adds
NUMA kernel text replication support, which should improve sparc64 kernel performance
a little bit.

I tested it on my machines, and it seems to work for me. But they are not standard
Sun SPARC V9 boxes, so someone with a standard, vanilla-supported machine is sought!

Is anybody able to help me?

To test it, you need to: 1) clone David Miller's git tree:

git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git

2) apply the patch, and 3) don't forget to enable CONFIG_NUMA in xconfig/menuconfig.
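
For concreteness, the whole sequence might look like this (the file name
numa-ktext.patch is just a placeholder for however you save this mail):

  # 1) shallow clone of the sparc tree
  git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git
  cd sparc

  # 2) apply the patch saved from this mail
  git apply ../numa-ktext.patch

  # 3) enable CONFIG_NUMA, then build and install as usual
  make menuconfig        # set CONFIG_NUMA=y
  make -j4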

The next step is to do a boot test. If everything is OK, I'll be very thankful
if you're also able to run any short performance test: once before the patch and once
with it applied.
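
Any quick benchmark is fine; for example (just a suggestion, assuming perf is
available on the box):

  # run a few times on both the unpatched and the patched kernel
  perf bench sched messaging -g 20 -l 1000

Even a timed kernel build would do. You can also confirm from dmesg that the
replication actually happened; the patch prints a line per node like
"node 1: Allocated memory for copy of kernel text: [...]":

  dmesg | grep "copy of kernel text"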

Thanks!

Signed-off-by: Kirill Tkhai <tkhai@yandex.ru>
---
 arch/sparc/include/asm/page_64.h    |    3 +
 arch/sparc/include/asm/pgtable_64.h |    6 ++
 arch/sparc/include/asm/trap_block.h |   17 ++++++
 arch/sparc/kernel/smp_64.c          |    8 ++-
 arch/sparc/kernel/trampoline_64.S   |   46 ++++++++++++++---
 arch/sparc/mm/init_64.c             |   94 ++++++++++++++++++++++++++++++++++-
 arch/sparc/mm/init_64.h             |    2 +-
 7 files changed, 163 insertions(+), 13 deletions(-)
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index aac53fc..5a85352 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -8,6 +8,9 @@
 #define PAGE_SIZE    (_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK    (~(PAGE_SIZE-1))
 
+#define PAGE4MB_SHIFT		22
+#define PAGE4MB_SIZE		(_AC(1,UL) << PAGE4MB_SHIFT)
+
 /* Flushing for D-cache alias handling is only needed if
  * the page size is smaller than 16K.
  */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 8358dc1..0b0495f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -884,6 +884,12 @@ extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
 extern void paging_init(void);
 extern unsigned long find_ecache_flush_span(unsigned long size);
 
+#ifdef CONFIG_NUMA
+extern void numa_copy_kernel_text(void);
+#else
+static inline void numa_copy_kernel_text(void) {}
+#endif
+
 struct seq_file;
 extern void mmu_info(struct seq_file *);
 
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index 7e26b2d..a2f0990 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -138,6 +138,23 @@ extern struct sun4v_2insn_patch_entry __sun4v_2insn_patch,
 	nop;						\
 	.previous;
 
+#ifdef CONFIG_NUMA
+
+#define __GET_NODEID(REG, TMP)				\
+	__GET_CPUID(REG)				\
+	sethi	%hi(numa_cpu_lookup_table), TMP;	\
+	or	TMP, %lo(numa_cpu_lookup_table), TMP;	\
+	sllx	REG, 2, REG;				\
+	add	TMP, REG, TMP;				\
+	lduw	[TMP], REG;
+
+#else /* !CONFIG_NUMA */
+
+#define __GET_NODEID(REG, TMP)				\
+	clr	REG
+
+#endif /* !CONFIG_NUMA */
+
 #ifdef CONFIG_SMP
 
 #define TRAP_LOAD_TRAP_BLOCK(DEST, TMP)		\
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b66a533..554a0c5 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -285,7 +285,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
 				void **descrp)
 {
 	extern unsigned long sparc64_ttable_tl0;
-	extern unsigned long kern_locked_tte_data;
+	extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
 	struct hvtramp_descr *hdesc;
 	unsigned long trampoline_ra;
 	struct trap_per_cpu *tb;
@@ -315,7 +315,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
 	hdesc->thread_reg = thread_reg;
 
 	tte_vaddr = (unsigned long) KERNBASE;
-	tte_data = kern_locked_tte_data;
+	tte_data = kern_locked_tte_data[0];
 
 	for (i = 0; i < hdesc->num_mappings; i++) {
 		hdesc->maps[i].vaddr = tte_vaddr;
@@ -1214,6 +1214,10 @@ int setup_profiling_timer(unsigned int multiplier)
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	/* Duplicate the kernel text on every node. Do this after
+	 * all kernel patching has been applied.
+	 */
+	numa_copy_kernel_text();
 }
 
 void smp_prepare_boot_cpu(void)
diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S
index ad4bde3..e5a4f85 100644
--- a/arch/sparc/kernel/trampoline_64.S
+++ b/arch/sparc/kernel/trampoline_64.S
@@ -117,26 +117,42 @@ startup_continue:
 	flushw
 
 	/* Setup the loop variables:
+	 * %l1: Number of 4MB pages containing non-init kernel text
+	 * %l2: TTE base of node 0. Used for DTLB and for rest of __init text
+	 *	ITLB mappings. See numa_alloc_kernel_text() for details.
 	 * %l3: VADDR base
-	 * %l4: TTE base
+	 * %l4: TTE base of current node. Used for ITLB.
 	 * %l5: Loop iterator, iterates from 0 to 'num_kernel_image_mappings'
 	 * %l6: Number of TTE entries to map
 	 * %l7: Highest TTE entry number, we count down
 	 */
 	sethi		%hi(KERNBASE), %l3
 	sethi		%hi(kern_locked_tte_data), %l4
-	ldx		[%l4 + %lo(kern_locked_tte_data)], %l4
+	or		%l4, %lo(kern_locked_tte_data), %l4
+	ldx		[%l4], %l2	! kern_locked_tte_data[0]
+
+	__GET_NODEID(%g2, %g1)
+	sllx		%g2, 3, %g2
+	add		%l4, %g2, %l4
+	ldx		[%l4], %l4	! kern_locked_tte_data[node]
+
 	clr		%l5
 	sethi		%hi(num_kernel_image_mappings), %l6
 	lduw		[%l6 + %lo(num_kernel_image_mappings)], %l6
 
+	sethi		%hi(num_node_copy_mappings), %l1
+	lduw		[%l1 + %lo(num_node_copy_mappings)], %l1
+
 	mov		15, %l7
 	BRANCH_IF_ANY_CHEETAH(g1,g5,2f)
 
 	mov		63, %l7
 2:
-
-3:
+	cmp		%l5, %l1	!__init section
+	bne		4f
+	 nop
+	mov		%l2, %l4	!use node 0 TTE
+4:
 	/* Lock into I-MMU */
 	sethi		%hi(call_method), %g2
 	or		%g2, %lo(call_method), %g2
@@ -190,7 +206,7 @@ startup_continue:
 
 	add		%l3, %g1, %g2
 	stx		%g2, [%sp + 2047 + 128 + 0x28]	! VADDR
-	add		%l4, %g1, %g2
+	add		%l2, %g1, %g2
 	stx		%g2, [%sp + 2047 + 128 + 0x30]	! TTE
 
 	/* TTE index is highest minus loop index.  */
@@ -205,7 +221,7 @@ startup_continue:
 
 	add		%l5, 1, %l5
 	cmp		%l5, %l6
-	bne,pt		%xcc, 3b
+	bne,pt		%xcc, 2b
 	 nop
 
 	sethi		%hi(prom_entry_lock), %g2
@@ -217,12 +233,26 @@ startup_continue:
 niagara_lock_tlb:
 	sethi		%hi(KERNBASE), %l3
 	sethi		%hi(kern_locked_tte_data), %l4
-	ldx		[%l4 + %lo(kern_locked_tte_data)], %l4
+	or		%l4, %lo(kern_locked_tte_data), %l4
+	ldx		[%l4], %l2	! kern_locked_tte_data[0]
+
+	__GET_NODEID(%g2, %g1)
+	sllx		%g2, 3, %g2
+	add		%l4, %g2, %l4
+	ldx		[%l4], %l4	! kern_locked_tte_data[node]
+
 	clr		%l5
 	sethi		%hi(num_kernel_image_mappings), %l6
 	lduw		[%l6 + %lo(num_kernel_image_mappings)], %l6
 
+	sethi		%hi(num_node_copy_mappings), %l1
+	lduw		[%l1 + %lo(num_node_copy_mappings)], %l1
 1:
+	cmp		%l5, %l1	!__init section
+	bne		4f
+	 nop
+	mov		%l2, %l4	!use node 0 TTE
+4:
 	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
 	sllx		%l5, 22, %g2
 	add		%l3, %g2, %o0
@@ -235,7 +265,7 @@ niagara_lock_tlb:
 	sllx		%l5, 22, %g2
 	add		%l3, %g2, %o0
 	clr		%o1
-	add		%l4, %g2, %o2
+	add		%l2, %g2, %o2
 	mov		HV_MMU_DMMU, %o3
 	ta		HV_FAST_TRAP
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e53..0183213 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -186,6 +186,7 @@ unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
 unsigned long sparc64_kern_sec_context __read_mostly;
 
 int num_kernel_image_mappings;
+int num_node_copy_mappings;
 
 #ifdef CONFIG_DEBUG_DCFLUSH
 atomic_t dcpage_flushes = ATOMIC_INIT(0);
@@ -477,7 +478,7 @@ void mmu_info(struct seq_file *m)
 struct linux_prom_translation prom_trans[512] __read_mostly;
 unsigned int prom_trans_ents __read_mostly;
 
-unsigned long kern_locked_tte_data;
+unsigned long kern_locked_tte_data[MAX_NUMNODES];
 
 /* The obp translations are saved based on 8k pagesize, since obp can
  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
@@ -591,7 +592,7 @@ static void __init remap_kernel(void)
 	phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
 	tte_data = kern_large_tte(phys_page);
 
-	kern_locked_tte_data = tte_data;
+	kern_locked_tte_data[0] = tte_data;
 
 	/* Now lock us into the TLBs via Hypervisor or OBP. */
 	if (tlb_type == hypervisor) {
@@ -1330,6 +1331,79 @@ static void __init bootmem_init_nonnuma(void)
 	node_set_online(0);
 }
 
+#ifdef CONFIG_NUMA
+
+/* Allocate memory for a per-node copy of the kernel text.
+ * The copying itself is done after all kernel patching
+ * has been applied.
+ */
+static void __init numa_alloc_kernel_text(void)
+{
+	unsigned long init_start = (unsigned long)__init_begin;
+	unsigned int size, node;
+
+	/* The rest of the init text will be mapped from the original image.
+	 */
+	size = round_up(init_start - KERNBASE, PAGE4MB_SIZE);
+	num_node_copy_mappings = size >> PAGE4MB_SHIFT;
+
+	for (node = 1; node < num_node_masks; node++) {
+		unsigned long tte_data;
+		phys_addr_t new_base_pa;
+
+		new_base_pa = memblock_alloc_nid(size, PAGE4MB_SIZE, node);
+
+		if (new_base_pa) {
+			pr_info("node %d: Allocated memory for copy of "
+				"kernel text: [%016llx, %016llx]\n",
+				 node, new_base_pa, new_base_pa + size);
+			tte_data = kern_large_tte(new_base_pa);
+		} else {
+			pr_err("node %d: Can't allocate memory for kernel "
+			       "text duplicate\n", node);
+			tte_data = kern_locked_tte_data[0];
+		}
+
+		kern_locked_tte_data[node] = tte_data;
+	}
+}
+
+/* Duplicate kernel text on every NUMA node.
+ * Do not copy pages which contain only init text,
+ * because they are mapped from the original kernel image.
+ */
+void numa_copy_kernel_text(void)
+{
+	unsigned int size, node;
+	unsigned long tte_data0;
+
+	size = num_node_copy_mappings << PAGE4MB_SHIFT;
+	tte_data0 = kern_locked_tte_data[0];
+
+	for (node = 1; node < num_node_masks; node++) {
+		unsigned long tte_data, phys_addr;
+
+		tte_data = kern_locked_tte_data[node];
+
+		if (tte_data == tte_data0)
+			continue;
+
+		/* PA is the [42:13] bit range */
+		phys_addr = (((tte_data << 21) >> 21) >> 13) << 13;
+
+		memcpy(__va(phys_addr), (void *)KERNBASE, size);
+	}
+}
+
+#else /* CONFIG_NUMA */
+
+static void __init numa_alloc_kernel_text(void)
+{
+}
+
+#endif /* CONFIG_NUMA */
+
+
 static unsigned long __init bootmem_init(unsigned long phys_base)
 {
 	unsigned long end_pfn;
@@ -1341,6 +1415,8 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
 	if (bootmem_init_numa() < 0)
 		bootmem_init_nonnuma();
 
+	numa_alloc_kernel_text();
+
 	/* Dump memblock with node info. */
 	memblock_dump_all();
 
@@ -1922,6 +1998,9 @@ void __init paging_init(void)
 		memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
 	}
 
+#ifdef CONFIG_NUMA
+	kern_size = round_up(kern_size, PAGE4MB_SIZE);
+#endif
 	memblock_reserve(kern_base, kern_size);
 
 	find_ramdisk(phys_base);
@@ -2188,6 +2267,17 @@ void free_initmem(void)
 	 * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes.
 	 */
 	addr = PAGE_ALIGN((unsigned long)(__init_begin));
+
+#ifdef CONFIG_NUMA
+	if (num_node_masks > 1) {
+		/* Do not free 4KB pages that lie in the same 4MB page
+		 * as the normal kernel text. Those addresses must stay
+		 * reserved forever.
+		 */
+		addr = round_up(addr, PAGE4MB_SIZE);
+	}
+#endif
+
 	initend = (unsigned long)(__init_end) & PAGE_MASK;
 	for (; addr < initend; addr += PAGE_SIZE) {
 		unsigned long page;
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 5d3782de..a14c8d8 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -34,7 +34,7 @@ extern struct linux_prom_translation prom_trans[512];
 extern unsigned int prom_trans_ents;
 
 /* Exported for SMP bootup purposes. */
-extern unsigned long kern_locked_tte_data;
+extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
 
 extern void prom_world(int enter);
 

