[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: [Need HELP!] Please help to test kernel patch (sparc64, NUMA)



Hi,

The system fails to boot with this patch:

[....] Setting preliminary keymap...[   79.970910] Unable to handle kernel paging request at virtual address 0000000001976000
[   80.075059] tsk->{mm,active_mm}->context = 00000000000002c6
[   80.148245] tsk->{mm,active_mm}->pgd = fffffc133bbdc000
[   80.216869]               \|/ ____ \|/
[   80.216869]               "@'/ .. \`@"
[   80.216869]               /_| \__/ |_\
[   80.216869]                  \__U_/
[   80.410178] gzip(449): Oops [#1]
[   80.452481] CPU: 1 PID: 449 Comm: gzip Not tainted 3.13.0-rc2-ghost-3-sparcnuma-g1de425c-dirty #1
[   80.569145] task: fffffc133be1c980 ti: fffffc033cbd8000 task.ti: fffffc033cbd8000
[   80.667505] TSTATE: 0000000080e01600 TPC: 0000000000634858 TNPC: 000000000063485c Y: 00000000    Not tainted
[   80.796755] TPC: <__blk_segment_map_sg+0x58/0x140>
[   80.859643] g0: 0000004480e01601 g1: 0000010006789740 g2: 0000000000000000 g3: 0000000000000000
[   80.974023] g4: fffffc133be1c980 g5: fffffc133c9f6000 g6: fffffc033cbd8000 g7: 000000033c4ba000
[   81.088397] o0: 459442c846064700 o1: 0000000000000060 o2: 0000000000000000 o3: fffffc033df00bd0
[   81.202768] o4: 0000000000000000 o5: 0000000000001000 sp: fffffc033cbd9a41 ret_pc: 0000000000634838
[   81.321719] RPC: <__blk_segment_map_sg+0x38/0x140>
[   81.384617] l0: 0000000000001000 l1: 0000000000000005 l2: 0000000000000060 l3: 0000000000000be8
[   81.498994] l4: 000000000000007f l5: 0000000100000000 l6: fffffffe00000000 l7: 0000000000c14550
[   81.613369] i0: fffffc033c8c0000 i1: fffffc133bacbd00 i2: 459442c846064700 i3: fffffc033cbda3b8
[   81.727741] i4: fffffc033cbda3c0 i5: fffffc033cbda3c8 i6: fffffc033cbd9af1 i7: 0000000000634a7c
[   81.842116] I7: <blk_rq_map_sg+0x5c/0x1e0>
[   81.895861] Call Trace:
[   81.927887]  [0000000000634a7c] blk_rq_map_sg+0x5c/0x1e0
[   81.997660]  [00000000006e6ec8] scsi_init_sgtable+0x28/0x80
[   82.070854]  [00000000006e6f30] scsi_init_io+0x10/0x140
[   82.139485]  [0000000000700c30] sd_prep_fn+0x410/0x1020
[   82.208109]  [0000000000630160] blk_peek_request+0xe0/0x280
[   82.281300]  [00000000006e68e4] scsi_request_fn+0x44/0x580
[   82.353355]  [000000000062cbb4] __blk_run_queue+0x34/0x60
[   82.424273]  [0000000000648180] cfq_insert_request+0x2c0/0x4e0
[   82.500896]  [000000000062a1c8] __elv_add_request+0x1c8/0x2e0
[   82.576382]  [0000000000630734] blk_flush_plug_list+0x214/0x280
[   82.654163]  [0000000000816934] io_schedule+0x54/0xc0
[   82.720494]  [0000000000584604] sleep_on_buffer+0x4/0x20
[   82.790261]  [0000000000816fbc] __wait_on_bit+0x7c/0xe0
[   82.858883]  [0000000000817064] out_of_line_wait_on_bit+0x44/0x60
[   82.938944]  [0000000000586ce0] __bread+0xc0/0x100
[   83.001855]  [00000000005cb9a4] ext3_get_branch+0xa4/0x180
[   83.073901] Disabling lock debugging due to kernel taint
[   83.143672] Caller[0000000000634a7c]: blk_rq_map_sg+0x5c/0x1e0
[   83.220301] Caller[00000000006e6ec8]: scsi_init_sgtable+0x28/0x80
[   83.300363] Caller[00000000006e6f30]: scsi_init_io+0x10/0x140
[   83.375849] Caller[0000000000700c30]: sd_prep_fn+0x410/0x1020
[   83.451337] Caller[0000000000630160]: blk_peek_request+0xe0/0x280
[   83.531395] Caller[00000000006e68e4]: scsi_request_fn+0x44/0x580
[   83.610313] Caller[000000000062cbb4]: __blk_run_queue+0x34/0x60
[   83.688087] Caller[0000000000648180]: cfq_insert_request+0x2c0/0x4e0
[   83.771580] Caller[000000000062a1c8]: __elv_add_request+0x1c8/0x2e0
[   83.853927] Caller[0000000000630734]: blk_flush_plug_list+0x214/0x280
[   83.938563] Caller[0000000000816934]: io_schedule+0x54/0xc0
[   84.011761] Caller[0000000000584604]: sleep_on_buffer+0x4/0x20
[   84.088393] Caller[0000000000816fbc]: __wait_on_bit+0x7c/0xe0
[   84.163878] Caller[0000000000817064]: out_of_line_wait_on_bit+0x44/0x60
[   84.250800] Caller[0000000000586ce0]: __bread+0xc0/0x100
[   84.320568] Caller[00000000005cb9a4]: ext3_get_branch+0xa4/0x180
[   84.399487] Caller[00000000005cee3c]: ext3_get_blocks_handle+0x13c/0xf40
[   84.487553] Caller[00000000005cfcc4]: ext3_get_block+0x84/0xe0
[   84.564188] Caller[00000000005916b8]: do_mpage_readpage+0x358/0x620
[   84.646532] Caller[0000000000591a7c]: mpage_readpages+0x9c/0x120
[   84.725452] Caller[0000000000507188]: __do_page_cache_readahead+0x148/0x200
[   84.816947] Caller[0000000000507514]: ra_submit+0x14/0x40
[   84.887862] Caller[00000000004fcad8]: filemap_fault+0x318/0x400
[   84.965639] Caller[000000000051fba8]: __do_fault+0x48/0x4c0
[   85.038832] Caller[0000000000523a78]: handle_mm_fault+0x258/0xaa0
[   85.118899] Caller[000000000044e57c]: do_sparc64_fault+0x41c/0x820
[   85.200097] Caller[0000000000407a58]: sparc64_realfault_common+0x10/0x20
[   85.288165] Caller[00000000005a7008]: padzero+0x28/0x40
[   85.356787] Caller[00000000005a7968]: load_elf_binary+0x728/0x1240
[   85.437997] Caller[000000000055b214]: search_binary_handler+0xb4/0x260
[   85.523770] Instruction DUMP: c606600c  84086003  0ac08031 <ce5e8000> 8e09e003  c626a008  e026a00c  82104007  c2768000
[  109.021944] BUG: soft lockup - CPU#1 stuck for 22s! [udevd:234]
[  109.021955] BUG: soft lockup - CPU#0 stuck for 22s! [udevd:229]
[  109.021998] Modules linked in: ohci_pci ohci_hcd ehci_hcd nouveau ttm drm_kms_helper snd_ali5451 snd_ac97_codec ac97_bus snd_pcm snd_page_alloc usbcore snd_timer usb_common snd tg3 drm ptp soundcore i2c_algo_bit pps_core libphy uio_pdrv_genirq uio
[  109.022007] CPU: 0 PID: 229 Comm: udevd Tainted: G      D      3.13.0-rc2-ghost-3-sparcnuma-g1de425c-dirty #1
[  109.022012] task: fffffc033c923720 ti: fffffc033ca38000 task.ti: fffffc033ca38000
[  109.022017] TSTATE: 0000004480001603 TPC: 0000000000819650 TNPC: 0000000000819654 Y: 00000000    Tainted: G      D
[  109.022031] TPC: <_raw_write_lock_bh+0x50/0x100>
[  109.022034] g0: 000000000000099e g1: 0000000000000000 g2: 00000000000000ff g3: 000000000000000e
[  109.022037] g4: fffffc033c923720 g5: fffffc033d5f6000 g6: fffffc033ca38000 g7: 0000000003126e98
[  109.022040] o0: fffffc033c8c0408 o1: 00000000000007f5 o2: 0000000000000000 o3: 000000137c50adaf
[  109.022043] o4: 00000000000137c5 o5: 00000000000007f5 sp: fffffc033fffb431 ret_pc: 0000000000635cb0
[  109.022055] RPC: <blk_rq_timed_out_timer+0x10/0xa0>
[  109.022058] l0: 000000000081f000 l1: 0000000000000000 l2: 0000000000000001 l3: fffffc033e004bb0
[  109.022061] l4: 00000000f7efc938 l5: 00000000f7ee9720 l6: 00000000f7c68124 l7: 00000000f7efc000
[  109.022064] i0: fffffc033c8c0000 i1: 0000000000000000 i2: 0000000000000000 i3: fffffc033e003618
[  109.022067] i4: fffffc033c8c0540 i5: fffffc033e003768 i6: fffffc033fffb4f1 i7: 0000000000467a78
[  109.022077] I7: <call_timer_fn+0x18/0x140>
[  109.022078] Call Trace:
[  109.022083]  [0000000000467a78] call_timer_fn+0x18/0x140
[  109.022087]  [00000000004680bc] run_timer_softirq+0x23c/0x280
[  109.022098]  [0000000000460400] __do_softirq+0xc0/0x2a0
[  109.022107]  [000000000042b9ec] do_softirq_own_stack+0x2c/0x40
[  109.022111]  [0000000000460984] irq_exit+0xa4/0xc0
[  109.022119]  [000000000042f9cc] timer_interrupt+0xac/0xe0
[  109.022126]  [00000000004209d4] tl0_irq14+0x14/0x20
[  109.022130]  [0000000000819230] _raw_spin_unlock_irqrestore+0x10/0x40
[  109.022134]  [000000000045de50] do_exit+0x590/0xa00
[  109.022137]  [000000000045e3e8] do_group_exit+0x28/0xc0
[  109.022141]  [000000000045e494] SyS_exit_group+0x14/0x20
[  109.022151]  [0000000000406174] linux_sparc_syscall32+0x34/0x40
[  111.819554] Modules linked in: ohci_pci ohci_hcd ehci_hcd nouveau ttm drm_kms_helper snd_ali5451 snd_ac97_codec ac97_bus snd_pcm snd_page_alloc usbcore snd_timer usb_common snd tg3 drm ptp soundcore i2c_algo_bit pps_core libphy uio_pdrv_genirq uio
[  112.107775] CPU: 1 PID: 234 Comm: udevd Tainted: G      D      3.13.0-rc2-ghost-3-sparcnuma-g1de425c-dirty #1
[  112.238168] task: fffffc033c9255c0 ti: fffffc033ca78000 task.ti: fffffc033ca78000
[  112.336527] TSTATE: 0000004411001603 TPC: 0000000000819230 TNPC: 0000000000819234 Y: 0000b7f8    Tainted: G      D
[  112.478356] TPC: <_raw_spin_unlock_irqrestore+0x10/0x40>
[  112.548111] g0: fffffc133c09afc8 g1: 0000000000000000 g2: 0000000000000000 g3: 0000000000980058
[  112.662489] g4: fffffc033c9255c0 g5: fffffc133c9f6000 g6: fffffc033ca78000 g7: 0000000000000001
[  112.776863] o0: fffffc133ba53e10 o1: 0000000000000000 o2: 0000000000000001 o3: fffffc033c9255c0
[  112.891236] o4: 0000000000000000 o5: 000000000000000e sp: fffffc033ca7b3c1 ret_pc: 0000000000633a58
[  113.010187] RPC: <put_io_context_active+0xd8/0x100>
[  113.074227] l0: fffffc033c925a18 l1: 0000000000100100 l2: 0000000000000001 l3: 00000000008ce2f0
[  113.188606] l4: 00000000f7efc938 l5: 00000000f7ee9720 l6: 00000000f7c68124 l7: 00000000f7efc000
[  113.302978] i0: fffffc133ba53e00 i1: 0000000000000014 i2: 0000000000000000 i3: fffffc133ba645a0
[  113.417350] i4: fffffc133ba53e10 i5: fffffc133ba53e00 i6: fffffc033ca7b471 i7: 000000000045de50
[  113.531728] I7: <do_exit+0x590/0xa00>
[  113.579752] Call Trace:
[  113.611777]  [000000000045de50] do_exit+0x590/0xa00
[  113.675827]  [000000000045e3e8] do_group_exit+0x28/0xc0
[  113.744451]  [000000000045e494] SyS_exit_group+0x14/0x20
[  113.814224]  [0000000000406174] linux_sparc_syscall32+0x34/0x40


On Mon, Apr 21, 2014 at 3:55 AM, Kirill Tkhai <tkhai@yandex.ru> wrote:
Hi, Patrick,
 
have you tried the patch we spoke about?
 
Could you please test it?
 
Kirill
 
06.12.2013, 00:44, "Kirill Tkhai" <tkhai@yandex.ru>:
I attach the patch as file
 
Kirill
 
05.12.2013, 21:00, "Patrick Baggett" <baggett.patrick@gmail.com>:
Kirill,

I copied the contents of your email into a patch file, and when I did git apply --check, it didn't really work:

figgles@ghost:~/sparc-numa/sparc$ git apply --check numa.patch
error: patch failed: arch/sparc/include/asm/trap_block.h:138
error: arch/sparc/include/asm/trap_block.h: patch does not apply
error: patch failed: arch/sparc/kernel/smp_64.c:285
error: arch/sparc/kernel/smp_64.c: patch does not apply
error: patch failed: arch/sparc/kernel/trampoline_64.S:117
error: arch/sparc/kernel/trampoline_64.S: patch does not apply
error: patch failed: arch/sparc/mm/init_64.c:591
error: arch/sparc/mm/init_64.c: patch does not apply
I'm probably doing something dumb, so let me know what I need to change. ;)

Patrick
 


On Wed, Dec 4, 2013 at 3:56 AM, Kirill Tkhai <tkhai@yandex.ru> wrote:
Hi,

I'm looking for a person who has a sparc64 machine with NUMA. The patch below adds
NUMA kernel text replication support. This should improve sparc64 kernel performance
a little bit.

I tested it on my machines, and it seems to work for me. But they are not standard
sun v9. So a person with a standard, vanilla-supported machine is sought!

Is anybody able to help me?

It's necessary to 1)clone David Miller's git tree:

git clone --depth=1 git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc.git

2)apply the patch and 3)do not forget to enable CONFIG_NUMA in xconfig/menuconfig.

The next step is to do a boot test. If everything is OK, I'll be very thankful
if you're able to run a short performance test: before the patch and with it.

Thanks!

Signed-off-by: Kirill Tkhai <tkhai@yandex.ru>
---
 arch/sparc/include/asm/page_64.h    |    3 +
 arch/sparc/include/asm/pgtable_64.h |    6 ++
 arch/sparc/include/asm/trap_block.h |   17 ++++++
 arch/sparc/kernel/smp_64.c          |    8 ++-
 arch/sparc/kernel/trampoline_64.S   |   46 ++++++++++++++---
 arch/sparc/mm/init_64.c             |   94 ++++++++++++++++++++++++++++++++++-
 arch/sparc/mm/init_64.h             |    2 +-
 7 files changed, 163 insertions(+), 13 deletions(-)
diff --git a/arch/sparc/include/asm/page_64.h b/arch/sparc/include/asm/page_64.h
index aac53fc..5a85352 100644
--- a/arch/sparc/include/asm/page_64.h
+++ b/arch/sparc/include/asm/page_64.h
@@ -8,6 +8,9 @@
 #define PAGE_SIZE    (_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK    (~(PAGE_SIZE-1))

+#define PAGE4MB_SHIFT          22
+#define PAGE4MB_SIZE           (_AC(1,UL) << PAGE4MB_SHIFT)
+
 /* Flushing for D-cache alias handling is only needed if
  * the page size is smaller than 16K.
  */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 8358dc1..0b0495f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -884,6 +884,12 @@ extern pmd_t swapper_low_pmd_dir[PTRS_PER_PMD];
 extern void paging_init(void);
 extern unsigned long find_ecache_flush_span(unsigned long size);

+#ifdef CONFIG_NUMA
+extern void numa_copy_kernel_text(void);
+#else
+static inline void numa_copy_kernel_text(void) {}
+#endif
+
 struct seq_file;
 extern void mmu_info(struct seq_file *);

diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index 7e26b2d..a2f0990 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -138,6 +138,23 @@ extern struct sun4v_2insn_patch_entry __sun4v_2insn_patch,
        nop;                                            \
        .previous;

+#ifdef CONFIG_NUMA
+
+#define __GET_NODEID(REG, TMP)                         \
+       __GET_CPUID(REG)                                \
+       sethi   %hi(numa_cpu_lookup_table), TMP;        \
+       or      TMP, %lo(numa_cpu_lookup_table), TMP;   \
+       sllx    REG, 2, REG;                            \
+       add     TMP, REG, TMP;                          \
+       lduw    [TMP], REG;
+
+#else /* !CONFIG_NUMA */
+
+#define __GET_NODEID(REG, TMP)                         \
+       clr     REG
+
+#endif /* !CONFIG_NUMA */
+
 #ifdef CONFIG_SMP

 #define TRAP_LOAD_TRAP_BLOCK(DEST, TMP)                \
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index b66a533..554a0c5 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -285,7 +285,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
                                void **descrp)
 {
        extern unsigned long sparc64_ttable_tl0;
-       extern unsigned long kern_locked_tte_data;
+       extern unsigned long kern_locked_tte_data[MAX_NUMNODES];
        struct hvtramp_descr *hdesc;
        unsigned long trampoline_ra;
        struct trap_per_cpu *tb;
@@ -315,7 +315,7 @@ static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg,
        hdesc->thread_reg = thread_reg;

        tte_vaddr = (unsigned long) KERNBASE;
-       tte_data = kern_locked_tte_data;
+       tte_data = kern_locked_tte_data[0];

        for (i = 0; i < hdesc->num_mappings; i++) {
                hdesc->maps[i].vaddr = tte_vaddr;
@@ -1214,6 +1214,10 @@ int setup_profiling_timer(unsigned int multiplier)

 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+       /* Dublicate kernel on every node. Do this after
+        * all kernel patches are applied.
+        */
+       numa_copy_kernel_text();
 }

 void smp_prepare_boot_cpu(void)
diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S
index ad4bde3..e5a4f85 100644
--- a/arch/sparc/kernel/trampoline_64.S
+++ b/arch/sparc/kernel/trampoline_64.S
@@ -117,26 +117,42 @@ startup_continue:
        flushw

        /* Setup the loop variables:
+        * %l1: Number of 4MB pages containing not-init kernel text
+        * %l2: TTE base of node 0. Used for DTLB and for rest of __init text
+        *      ITLB mappings. See numa_alloc_kernel_text() for details.
         * %l3: VADDR base
-        * %l4: TTE base
+        * %l4: TTE base of current node. Used for ITLB.
         * %l5: Loop iterator, iterates from 0 to 'num_kernel_image_mappings'
         * %l6: Number of TTE entries to map
         * %l7: Highest TTE entry number, we count down
         */
        sethi           %hi(KERNBASE), %l3
        sethi           %hi(kern_locked_tte_data), %l4
-       ldx             [%l4 + %lo(kern_locked_tte_data)], %l4
+       or              %l4, %lo(kern_locked_tte_data), %l4
+       ldx             [%l4], %l2      ! kern_locked_tte_data[0]
+
+       __GET_NODEID(%g2, %g1)
+       sllx            %g2, 3, %g2
+       add             %l4, %g2, %l4
+       ldx             [%l4], %l4      ! kern_locked_tte_data[node]
+
        clr             %l5
        sethi           %hi(num_kernel_image_mappings), %l6
        lduw            [%l6 + %lo(num_kernel_image_mappings)], %l6

+       sethi           %hi(num_node_copy_mappings), %l1
+       lduw            [%l1 + %lo(num_node_copy_mappings)], %l1
+
        mov             15, %l7
        BRANCH_IF_ANY_CHEETAH(g1,g5,2f)

        mov             63, %l7
 2:
-
-3:
+       cmp             %l5, %l1        !__init section
+       bne             4f
+        nop
+       mov             %l2, %l4        !use node 0 TTE
+4:
        /* Lock into I-MMU */
        sethi           %hi(call_method), %g2
        or              %g2, %lo(call_method), %g2
@@ -190,7 +206,7 @@ startup_continue:

        add             %l3, %g1, %g2
        stx             %g2, [%sp + 2047 + 128 + 0x28]  ! VADDR
-       add             %l4, %g1, %g2
+       add             %l2, %g1, %g2
        stx             %g2, [%sp + 2047 + 128 + 0x30]  ! TTE

        /* TTE index is highest minus loop index.  */
@@ -205,7 +221,7 @@ startup_continue:

        add             %l5, 1, %l5
        cmp             %l5, %l6
-       bne,pt          %xcc, 3b
+       bne,pt          %xcc, 2b
         nop

        sethi           %hi(prom_entry_lock), %g2
@@ -217,12 +233,26 @@ startup_continue:
 niagara_lock_tlb:
        sethi           %hi(KERNBASE), %l3
        sethi           %hi(kern_locked_tte_data), %l4
-       ldx             [%l4 + %lo(kern_locked_tte_data)], %l4
+       or              %l4, %lo(kern_locked_tte_data), %l4
+       ldx             [%l4], %l2      ! kern_locked_tte_data[0]
+
+       __GET_NODEID(%g2, %g1)
+       sllx            %g2, 3, %g2
+       add             %l4, %g2, %l4
+       ldx             [%l4], %l4      ! kern_locked_tte_data[node]
+
        clr             %l5
        sethi           %hi(num_kernel_image_mappings), %l6
        lduw            [%l6 + %lo(num_kernel_image_mappings)], %l6

+       sethi           %hi(num_node_copy_mappings), %l1
+       lduw            [%l1 + %lo(num_node_copy_mappings)], %l1
 1:
+       cmp             %l5, %l1        !__init section
+       bne             4f
+        nop
+       mov             %l2, %l4        !use node 0 TTE
+4:
        mov             HV_FAST_MMU_MAP_PERM_ADDR, %o5
        sllx            %l5, 22, %g2
        add             %l3, %g2, %o0
@@ -235,7 +265,7 @@ niagara_lock_tlb:
        sllx            %l5, 22, %g2
        add             %l3, %g2, %o0
        clr             %o1
-       add             %l4, %g2, %o2
+       add             %l2, %g2, %o2
        mov             HV_MMU_DMMU, %o3
        ta              HV_FAST_TRAP

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e53..0183213 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -186,6 +186,7 @@ unsigned long sparc64_kern_pri_nuc_bits __read_mostly;
 unsigned long sparc64_kern_sec_context __read_mostly;

 int num_kernel_image_mappings;
+int num_node_copy_mappings;

 #ifdef CONFIG_DEBUG_DCFLUSH
 atomic_t dcpage_flushes = ATOMIC_INIT(0);
@@ -477,7 +478,7 @@ void mmu_info(struct seq_file *m)
 struct linux_prom_translation prom_trans[512] __read_mostly;
 unsigned int prom_trans_ents __read_mostly;

-unsigned long kern_locked_tte_data;
+unsigned long kern_locked_tte_data[MAX_NUMNODES];

 /* The obp translations are saved based on 8k pagesize, since obp can
  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
@@ -591,7 +592,7 @@ static void __init remap_kernel(void)
        phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
        tte_data = kern_large_tte(phys_page);

-       kern_locked_tte_data = tte_data;
+       kern_locked_tte_data[0] = tte_data;

        /* Now lock us into the TLBs via Hypervisor or OBP. */
        if (tlb_type == hypervisor) {
@@ -1330,6 +1331,79 @@ static void __init bootmem_init_nonnuma(void)
        node_set_online(0);
 }

+#ifdef CONFIG_NUMA
+
+/* Allocate memory for per-node copy of kernel text.
+ * The copying itself will be made after all kernel
+ * patches are applied.
+ */
+static void __init numa_alloc_kernel_text(void)
+{
+       unsigned long init_start = (unsigned long)__init_begin;
+       unsigned int size, node;
+
+       /* The rest init text will be mapped from the original image.
+        */
+       size = round_up(init_start - KERNBASE, PAGE4MB_SIZE);
+       num_node_copy_mappings = size >> PAGE4MB_SHIFT;
+
+       for (node = 1; node < num_node_masks; node++) {
+               unsigned long tte_data;
+               phys_addr_t new_base_pa;
+
+               new_base_pa = memblock_alloc_nid(size, PAGE4MB_SIZE, node);
+
+               if (new_base_pa) {
+                       pr_info("node %d: Allocated memory for copy of "
+                               "kernel text: [%016llx, %016llx]\n",
+                                node, new_base_pa, new_base_pa + size);
+                       tte_data = kern_large_tte(new_base_pa);
+               } else {
+                       pr_err("node %d: Can't allocate memory for kernel "
+                              "text duplicate\n", node);
+                       tte_data = kern_locked_tte_data[0];
+               }
+
+               kern_locked_tte_data[node] = tte_data;
+       }
+}
+
+/* Dublicate kernel text on every NUMA node.
+ * Do not copy pages which contain only init text,
+ * because they are mapped from original kernel.
+ */
+void numa_copy_kernel_text(void)
+{
+       unsigned int size, node;
+       unsigned long tte_data0;
+
+       size = num_node_copy_mappings << PAGE4MB_SHIFT;
+       tte_data0 = kern_locked_tte_data[0];
+
+       for (node = 1; node < num_node_masks; node++) {
+               unsigned long tte_data, phys_addr;
+
+               tte_data = kern_locked_tte_data[node];
+
+               if (tte_data == tte_data0)
+                       continue;
+
+               /* PA is [42:12] range */
+               phys_addr = (((tte_data << 21) >> 21) >> 13) << 13;
+
+               memcpy(__va(phys_addr), (void *)KERNBASE, size);
+       }
+}
+
+#else /* CONFIG_NUMA */
+
+static void __init numa_alloc_kernel_text(void)
+{
+}
+
+#endif /* CONFIG_NUMA */
+
+
 static unsigned long __init bootmem_init(unsigned long phys_base)
 {
        unsigned long end_pfn;
@@ -1341,6 +1415,8 @@ static unsigned long __init bootmem_init(unsigned long phys_base)
        if (bootmem_init_numa() < 0)
                bootmem_init_nonnuma();

+       numa_alloc_kernel_text();
+
        /* Dump memblock with node info. */
        memblock_dump_all();

@@ -1922,6 +1998,9 @@ void __init paging_init(void)
                memblock_add(pavail[i].phys_addr, pavail[i].reg_size);
        }

+#ifdef CONFIG_NUMA
+       kern_size = round_up(kern_size, PAGE4MB_SIZE);
+#endif
        memblock_reserve(kern_base, kern_size);

        find_ramdisk(phys_base);
@@ -2188,6 +2267,17 @@ void free_initmem(void)
         * The init section is aligned to 8k in vmlinux.lds. Page align for >8k pagesizes.
         */
        addr = PAGE_ALIGN((unsigned long)(__init_begin));
+
+#ifdef CONFIG_NUMA
+       if (num_node_masks > 1) {
+               /* Do not free 4KB pages which are lying at 4MB page
+                * together with normal kernel text. Their addresses
+                * are forbidden forever.
+                */
+               addr = round_up(addr, PAGE4MB_SIZE);
+       }
+#endif
+
        initend = (unsigned long)(__init_end) & PAGE_MASK;
        for (; addr < initend; addr += PAGE_SIZE) {
                unsigned long page;
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 5d3782de..a14c8d8 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -34,7 +34,7 @@ extern struct linux_prom_translation prom_trans[512];
 extern unsigned int prom_trans_ents;

 /* Exported for SMP bootup purposes. */
-extern unsigned long kern_locked_tte_data;
+extern unsigned long kern_locked_tte_data[MAX_NUMNODES];

 extern void prom_world(int enter);



--
To UNSUBSCRIBE, email to debian-sparc-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org
Archive: http://lists.debian.org/176311386150980@web5m.yandex.ru



Reply to: