[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: [Need HELP!] Please help to test kernel patch (sparc64, NUMA)



A little performance test:

1) wget http://www.bitmover.com/lmbench/lmbench3.tar.gz
2) tar -zxvf lmbench3.tar.gz
3) cd lmbench3
4) make

   [You probably will have a compilation error:
       "make[2]: *** No rule to make target `../SCCS/s.ChangeSet', needed by `bk.ver'.  Stop."

    In this case a)vim src/Makefile
                 b)remove "../SCCS/s.ChangeSet" target after "bk.ver:"
   ]

5)cd bin/<*sparc only directory*>/

  Now you are ready to execute tests (copy here attached file):

  $ sh short_test.sh > results.txt

Thanks,
          Kirill

04.12.2013, 13:56, "Kirill Tkhai" <tkhai@yandex.ru>:
> Hi,
>
> I'm looking for a person who has sparc64 machine with NUMA. The patch below adds
> NUMA kernel text replication support. This should improve sparc64 kernel performance
> a little bit.
>
> I tested it on my machines, and it appears to work for me. But they are not standard
> sun v9. So a person with a standard vanilla-supported machine is sought!
>
> Is anybody able to help me?
>
> It's necessary to 1)clone David Miller's git tree:
>
> git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git
>
> 2)apply the patch and 3)do not forget to enable CONFIG_NUMA in xconfig/menuconfig.
>
> The next step is to do a boot test. If everything is OK, I'll be very thankful
> if you're able to execute any short performance test: before the patch and with it.
>
> Thanks!
>
> Signed-off-by: Kirill Tkhai <tkhai@yandex.ru>
> ---
>  arch/sparc/include/asm/page_64.h    |    3 +
>  arch/sparc/include/asm/pgtable_64.h |    6 ++
>  arch/sparc/include/asm/trap_block.h |   17 ++++++
>  arch/sparc/kernel/smp_64.c          |    8 ++-
>  arch/sparc/kernel/trampoline_64.S   |   46 ++++++++++++++---
>  arch/sparc/mm/init_64.c             |   94 ++++++++++++++++++++++++++++++++++-
>  arch/sparc/mm/init_64.h             |    2 +-
>  7 files changed, 163 insertions(+), 13 deletions(-)
> diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
> index aac53fc..5a85352 100644
> --- a/arch/sparc/include/asm/page_64.h
> +++ b/arch/sparc/include/asm/page_64.h
> @@ -8,6 +8,9 @@
>  #define PAGE_SIZE    (_AC(1,UL) << PAGE_SHIFT)
>  #define PAGE_MASK    (~(PAGE_SIZE-1))
>
> +#define PAGE4MB_SHIFT 22
> +#define PAGE4MB_SIZE (_AC(1,UL) << PAGE4MB_SHIFT)
> +
>  /* Flushing for D-cache alias handling is only needed if
>   * the page size is smaller than 16K.
>   */
> diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> index 8358dc1..0b0495f 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -884,6 +884,12 @@ extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
>  extern void paging_init(void);
>  extern unsigned long find_ecache_flush_span(unsigned long size);
>
> +#ifdef CONFIG_NUMA
> +extern void numa_copy_kernel_text(void);
> +#else
> +static inline void numa_copy_kernel_text(void) {}
> +#endif
> +
>  struct seq_file;
>  extern void mmu_info(struct seq_file *);
>
> diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
> index 7e26b2d..a2f0990 100644
> --- a/arch/sparc/include/asm/trap_block.h
> +++ b/arch/sparc/include/asm/trap_block.h
> @@ -138,6 +138,23 @@ extern struct sun4v_2insn_patch_entry __sun4v_2insn_patch,
>          nop; \
>          .previous;
>
> +#ifdef CONFIG_NUMA
> +
> +#define __GET_NODEID(REG, TMP) \
> + __GET_CPUID(REG) \
> + sethi %hi(numa_cpu_lookup_table), TMP; \
> + or TMP, %lo(numa_cpu_lookup_table), TMP; \
> + sllx REG, 2, REG; \
> + add TMP, REG, TMP; \
> + lduw [TMP], REG;
> +
> +#else /* !CONFIG_NUMA */
> +
> +#define __GET_NODEID(REG, TMP) \
> + clr REG
> +
> +#endif /* !CONFIG_NUMA */
> +
>  #ifdef CONFIG_SMP
>
>  #define TRAP_LOAD_TRAP_BLOCK(DEST, TMP) \
> diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
> index b66a533..554a0c5 100644
> --- a/arch/sparc/kernel/smp_64.c
> +++ b/arch/sparc/kernel/smp_64.c
> @@ -285,7 +285,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
>                                  void **descrp)
>  {
>          extern unsigned long sparc64_ttable_tl0;
> - extern unsigned long kern_locked_tte_data;
> + extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
>          struct hvtramp_descr *hdesc;
>          unsigned long trampoline_ra;
>          struct trap_per_cpu *tb;
> @@ -315,7 +315,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
>          hdesc->thread_reg = thread_reg;
>
>          tte_vaddr = (unsigned long) KERNBASE;
> - tte_data = kern_locked_tte_data;
> + tte_data = kern_locked_tte_data[0];
>
>          for (i = 0; i < hdesc->num_mappings; i++) {
>                  hdesc->maps[i].vaddr = tte_vaddr;
> @@ -1214,6 +1214,10 @@ int setup_profiling_timer(unsigned int multiplier)
>
>  void __init smp_prepare_cpus(unsigned int max_cpus)
>  {
> + /* Duplicate kernel on every node. Do this after
> + * all kernel patches are applied.
> + */
> + numa_copy_kernel_text();
>  }
>
>  void smp_prepare_boot_cpu(void)
> diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S
> index ad4bde3..e5a4f85 100644
> --- a/arch/sparc/kernel/trampoline_64.S
> +++ b/arch/sparc/kernel/trampoline_64.S
> @@ -117,26 +117,42 @@ startup_continue:
>          flushw
>
>          /* Setup the loop variables:
> + * %l1: Number of 4MB pages containing not-init kernel text
> + * %l2: TTE base of node 0. Used for DTLB and for rest of __init text
> + * ITLB mappings. See numa_alloc_kernel_text() for details.
>           * %l3: VADDR base
> - * %l4: TTE base
> + * %l4: TTE base of current node. Used for ITLB.
>           * %l5: Loop iterator, iterates from 0 to 'num_kernel_image_mappings'
>           * %l6: Number of TTE entries to map
>           * %l7: Highest TTE entry number, we count down
>           */
>          sethi %hi(KERNBASE), %l3
>          sethi %hi(kern_locked_tte_data), %l4
> - ldx [%l4 + %lo(kern_locked_tte_data)], %l4
> + or %l4, %lo(kern_locked_tte_data), %l4
> + ldx [%l4], %l2 ! kern_locked_tte_data[0]
> +
> + __GET_NODEID(%g2, %g1)
> + sllx %g2, 3, %g2
> + add %l4, %g2, %l4
> + ldx [%l4], %l4 ! kern_locked_tte_data[node]
> +
>          clr %l5
>          sethi %hi(num_kernel_image_mappings), %l6
>          lduw [%l6 + %lo(num_kernel_image_mappings)], %l6
>
> + sethi %hi(num_node_copy_mappings), %l1
> + lduw [%l1 + %lo(num_node_copy_mappings)], %l1
> +
>          mov 15, %l7
>          BRANCH_IF_ANY_CHEETAH(g1,g5,2f)
>
>          mov 63, %l7
>  2:
> -
> -3:
> + cmp %l5, %l1 !__init section
> + bne 4f
> + nop
> + mov %l2, %l4 !use node 0 TTE
> +4:
>          /* Lock into I-MMU */
>          sethi %hi(call_method), %g2
>          or %g2, %lo(call_method), %g2
> @@ -190,7 +206,7 @@ startup_continue:
>
>          add %l3, %g1, %g2
>          stx %g2, [%sp + 2047 + 128 + 0x28] ! VADDR
> - add %l4, %g1, %g2
> + add %l2, %g1, %g2
>          stx %g2, [%sp + 2047 + 128 + 0x30] ! TTE
>
>          /* TTE index is highest minus loop index.  */
> @@ -205,7 +221,7 @@ startup_continue:
>
>          add %l5, 1, %l5
>          cmp %l5, %l6
> - bne,pt %xcc, 3b
> + bne,pt %xcc, 2b
>           nop
>
>          sethi %hi(prom_entry_lock), %g2
> @@ -217,12 +233,26 @@ startup_continue:
>  niagara_lock_tlb:
>          sethi %hi(KERNBASE), %l3
>          sethi %hi(kern_locked_tte_data), %l4
> - ldx [%l4 + %lo(kern_locked_tte_data)], %l4
> + or %l4, %lo(kern_locked_tte_data), %l4
> + ldx [%l4], %l2 ! kern_locked_tte_data[0]
> +
> + __GET_NODEID(%g2, %g1)
> + sllx %g2, 3, %g2
> + add %l4, %g2, %l4
> + ldx [%l4], %l4 ! kern_locked_tte_data[node]
> +
>          clr %l5
>          sethi %hi(num_kernel_image_mappings), %l6
>          lduw [%l6 + %lo(num_kernel_image_mappings)], %l6
>
> + sethi %hi(num_node_copy_mappings), %l1
> + lduw [%l1 + %lo(num_node_copy_mappings)], %l1
>  1:
> + cmp %l5, %l1 !__init section
> + bne 4f
> + nop
> + mov %l2, %l4 !use node 0 TTE
> +4:
>          mov HV_FAST_MMU_MAP_PERM_ADDR, %o5
>          sllx %l5, 22, %g2
>          add %l3, %g2, %o0
> @@ -235,7 +265,7 @@ niagara_lock_tlb:
>          sllx %l5, 22, %g2
>          add %l3, %g2, %o0
>          clr %o1
> - add %l4, %g2, %o2
> + add %l2, %g2, %o2
>          mov HV_MMU_DMMU, %o3
>          ta HV_FAST_TRAP
>
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 5322e53..0183213 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -186,6 +186,7 @@ unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
>  unsigned long sparc64_kern_sec_context __read_mostly;
>
>  int num_kernel_image_mappings;
> +int num_node_copy_mappings;
>
>  #ifdef CONFIG_DEBUG_DCFLUSH
>  atomic_t dcpage_flushes = ATOMIC_INIT(0);
> @@ -477,7 +478,7 @@ void mmu_info(struct seq_file *m)
>  struct linux_prom_translation prom_trans[512] __read_mostly;
>  unsigned int prom_trans_ents __read_mostly;
>
> -unsigned long kern_locked_tte_data;
> +unsigned long kern_locked_tte_data[MAX_NUMNODES];
>
>  /* The obp translations are saved based on 8k pagesize, since obp can
>   * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
> @@ -591,7 +592,7 @@ static void __init remap_kernel(void)
>          phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
>          tte_data = kern_large_tte(phys_page);
>
> - kern_locked_tte_data = tte_data;
> + kern_locked_tte_data[0] = tte_data;
>
>          /* Now lock us into the TLBs via Hypervisor or OBP. */
>          if (tlb_type == hypervisor) {
> @@ -1330,6 +1331,79 @@ static void __init bootmem_init_nonnuma(void)
>          node_set_online(0);
>  }
>
> +#ifdef CONFIG_NUMA
> +
> +/* Allocate memory for per-node copy of kernel text.
> + * The copying itself will be made after all kernel
> + * patches are applied.
> + */
> +static void __init numa_alloc_kernel_text(void)
> +{
> + unsigned long init_start = (unsigned long)__init_begin;
> + unsigned int size, node;
> +
> + /* The rest init text will be mapped from the original image.
> + */
> + size = round_up(init_start - KERNBASE, PAGE4MB_SIZE);
> + num_node_copy_mappings = size >> PAGE4MB_SHIFT;
> +
> + for (node = 1; node < num_node_masks; node++) {
> + unsigned long tte_data;
> + phys_addr_t new_base_pa;
> +
> + new_base_pa = memblock_alloc_nid(size, PAGE4MB_SIZE, node);
> +
> + if (new_base_pa) {
> + pr_info("node %d: Allocated memory for copy of "
> + "kernel text: [%016llx, %016llx]\n",
> + node, new_base_pa, new_base_pa + size);
> + tte_data = kern_large_tte(new_base_pa);
> + } else {
> + pr_err("node %d: Can't allocate memory for kernel "
> +       "text duplicate\n", node);
> + tte_data = kern_locked_tte_data[0];
> + }
> +
> + kern_locked_tte_data[node] = tte_data;
> + }
> +}
> +
> +/* Duplicate kernel text on every NUMA node.
> + * Do not copy pages which contain only init text,
> + * because they are mapped from original kernel.
> + */
> +void numa_copy_kernel_text(void)
> +{
> + unsigned int size, node;
> + unsigned long tte_data0;
> +
> + size = num_node_copy_mappings << PAGE4MB_SHIFT;
> + tte_data0 = kern_locked_tte_data[0];
> +
> + for (node = 1; node < num_node_masks; node++) {
> + unsigned long tte_data, phys_addr;
> +
> + tte_data = kern_locked_tte_data[node];
> +
> + if (tte_data == tte_data0)
> + continue;
> +
> + /* PA is [42:12] range */
> + phys_addr = (((tte_data << 21) >> 21) >> 13) << 13;
> +
> + memcpy(__va(phys_addr), (void *)KERNBASE, size);
> + }
> +}
> +
> +#else /* CONFIG_NUMA */
> +
> +static void __init numa_alloc_kernel_text(void)
> +{
> +}
> +
> +#endif /* CONFIG_NUMA */
> +
> +
>  static unsigned long __init bootmem_init(unsigned long phys_base)
>  {
>          unsigned long end_pfn;
> @@ -1341,6 +1415,8 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
>          if (bootmem_init_numa() < 0)
>                  bootmem_init_nonnuma();
>
> + numa_alloc_kernel_text();
> +
>          /* Dump memblock with node info. */
>          memblock_dump_all();
>
> @@ -1922,6 +1998,9 @@ void __init paging_init(void)
>                  memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
>          }
>
> +#ifdef CONFIG_NUMA
> + kern_size = round_up(kern_size, PAGE4MB_SIZE);
> +#endif
>          memblock_reserve(kern_base, kern_size);
>
>          find_ramdisk(phys_base);
> @@ -2188,6 +2267,17 @@ void free_initmem(void)
>           * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes.
>           */
>          addr = PAGE_ALIGN((unsigned long)(__init_begin));
> +
> +#ifdef CONFIG_NUMA
> + if (num_node_masks > 1) {
> + /* Do not free 4KB pages which are lying at 4MB page
> + * together with normal kernel text. Their addresses
> + * are forbidden forever.
> + */
> + addr = round_up(addr, PAGE4MB_SIZE);
> + }
> +#endif
> +
>          initend = (unsigned long)(__init_end) & PAGE_MASK;
>          for (; addr < initend; addr += PAGE_SIZE) {
>                  unsigned long page;
> diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
> index 5d3782de..a14c8d8 100644
> --- a/arch/sparc/mm/init_64.h
> +++ b/arch/sparc/mm/init_64.h
> @@ -34,7 +34,7 @@ extern struct linux_prom_translation prom_trans[512];
>  extern unsigned int prom_trans_ents;
>
>  /* Exported for SMP bootup purposes. */
> -extern unsigned long kern_locked_tte_data;
> +extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
>
>  extern void prom_world(int enter);
#!/bin/bash
# Run a short lmbench performance comparison (lat_proc / bw_pipe) using all
# online CPUs. Intended to be executed from bin/<sparc dir>/ of an lmbench
# build; output should be redirected to a results file.

# Count online CPUs: /proc/cpuinfo lists one "online" line per CPU on sparc.
# NOTE(review): this counting scheme is sparc-specific — on most other arches
# `nproc` or `getconf _NPROCESSORS_ONLN` would be the portable choice.
CPU_NUM=$(grep -c online /proc/cpuinfo)

./lat_proc -P "$CPU_NUM" -N4 procedure 2>&1
./lat_proc -P "$CPU_NUM" -N4 fork      2>&1
./lat_proc -P "$CPU_NUM" -N4 exec      2>&1
./bw_pipe  -P "$CPU_NUM"               2>&1

Reply to: