
Re: libports and interrupted RPCs



On 31/08/2025 22:47, Samuel Thibault wrote:
> Michael Kelly, le sam. 30 août 2025 21:29:46 +0100, a ecrit:
>> This sequence of hurd_thread_cancel() calls all occur whilst a single
>> process wide mutex is held locked (see libports:interrupt_rpcs.c).
> You mean the _ports_lock mutex?

Yes.

>> The same lock is also required to begin or end other RPCs on other
>> ports and so they must wait until the initial interrupt_operation
>> completes.
> ? I don't think ports_interrupt_rpcs actually waits for something to
> finish? hurd_thread_cancel() should be asynchronous, and
> _ports_record_interruption clearly is.
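
For reference, the sequence I was describing looks roughly like this. It is a paraphrase from memory of libports/interrupt-rpcs.c, not the exact code, so details such as the current_rpcs field name may be slightly off:

#include <pthread.h>
#include <hurd/signal.h>   /* hurd_thread_cancel */
#include "ports.h"         /* _ports_lock, struct port_info, struct rpc_info */

void
ports_interrupt_rpcs (void *portstruct)
{
  struct port_info *pi = portstruct;
  struct rpc_info *rpc;

  pthread_mutex_lock (&_ports_lock);
  for (rpc = pi->current_rpcs; rpc; rpc = rpc->next)
    {
      /* Whatever hurd_thread_cancel() ends up doing (including
         _hurdsig_abort_rpcs() sending an interrupt_operation RPC to
         another task, as in the thread 35 trace below) happens while
         _ports_lock is still held.  */
      hurd_thread_cancel (rpc->thread);
      _ports_record_interruption (rpc);
    }
  pthread_mutex_unlock (&_ports_lock);
}

The point I was trying to make is that everything inside that loop runs with _ports_lock held, and that lock is also needed to begin or end RPCs on every other port in the task.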

I didn't have any evidence to present, so today I reran the stress test without my code changes to ports_interrupt_rpcs(). I have attached a reduced version of the very long set of stack traces from the ext2fs server; I have the complete list of all threads saved, but it is too long for this message. In summary, the traces show:

1) One thread (thread 35) is handling an interrupt_operation request. Its trace shows it making a secondary interrupt_operation RPC to a storeio task. The destination port has msgcount=5 against a qlimit of 5, so the message queue is full and the message cannot be delivered immediately. The call to interrupt_operation uses MACH_SEND_MSG|MACH_RCV_MSG|MACH_RCV_TIMEOUT with a timeout of 1000 ms (see the sketch after this list).

db> show kmsg 0xd4499000
kmsg=0xd4499000
ikm_next=0xf49fb000,prev=0xd6316000,size=4096,marequest=0x0
msgh_bits=0x1211: l=0x12,r=0x11
msgh_id=33000,size=24,seqno=0,remote=0xf67cb3c0(port_send),local=0xebfc3b08(port_send_once)
db> show port 0xf67cb3c0
port 0xf67cb3c0
  flags has_protected_payload=1
  active, refs=9, otype=0, kotype=0 (NONE)
  receiver=0xf55cba00, receiver_name=0x12
  mscount=1, srights=7, sorights=1
  nsrequest=0xf67cb3c0, pdrequest=0x0, dnrequests=0xf556bb30
  pset=0xf594eb10, seqno=875, msgcount=5, qlimit=5
  kmsgs=0x0, rcvrs=0x0, sndrs=0xf65ea2e0, kobj=0x0
  protected_payload=20002040

2) There are over 80 RPC calls blocked waiting for the _ports_lock mutex.

3) There are 10 other RPCs in progress.

4) A small number of threads for which no memory access is available have been removed from the stack trace listing.
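
To make point 1 concrete: the mach_msg_trap frame in the thread 35 trace comes from the MIG-generated interrupt_operation stub, which boils down to something like the sketch below. This is illustrative only; the real stub is generated from interrupt.defs (if I recall the file name correctly) and its message layout differs. The msgh_id of 33000 and the option value 0x103 = MACH_SEND_MSG|MACH_RCV_MSG|MACH_RCV_TIMEOUT are taken from the kmsg dump and the trace above:

#include <mach.h>

static kern_return_t
send_interrupt_operation (mach_port_t server, mach_msg_timeout_t timeout_ms)
{
  struct { mach_msg_header_t head; } msg;

  msg.head.msgh_bits = MACH_MSGH_BITS (MACH_MSG_TYPE_COPY_SEND,
                                       MACH_MSG_TYPE_MAKE_SEND_ONCE);
  msg.head.msgh_remote_port = server;
  msg.head.msgh_local_port = mach_reply_port ();
  msg.head.msgh_size = sizeof msg;
  msg.head.msgh_id = 33000;	/* interrupt_operation, as in the kmsg dump */

  /* If I read the Mach IPC semantics correctly, MACH_RCV_TIMEOUT only
     bounds the receive phase; without MACH_SEND_TIMEOUT the send phase
     can block for as long as the destination queue stays full
     (msgcount == qlimit above), which is where thread 35 is parked in
     ipc_mqueue_send.  */
  return mach_msg (&msg.head,
                   MACH_SEND_MSG | MACH_RCV_MSG | MACH_RCV_TIMEOUT,
                   sizeof msg, sizeof msg,
                   msg.head.msgh_local_port,
                   timeout_ms, MACH_PORT_NULL);
}

Meanwhile the 80+ threads in point 2 are sitting in pthread_mutex_lock() on _ports_lock inside ports_begin_rpc(), as the thread 12 trace shows, so they cannot make progress until thread 35 releases _ports_lock, which in turn is waiting on the blocked interrupt_operation send.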

I'm still quite inexperienced with this code, so I could be mistaken in my conclusions.

I hope this helps.

Regards,

Mike.
thread:	0 (plus 9 similar stack traces)
Continuation mach_msg_continue
>>>>> user space <<<<<
_hurd_intr_rpc_mach_msg(7feadc,3,6c,434,13)
fsys_getroot(14a,668,13,6113e10,1)
fshelp_fetch_root(2007564c,7ffce0,668,6113e00,1)
diskfs_S_dir_lookup(6201490,7ffe4c,1,0,801e64)
_Xdir_lookup(7ffe30,801e40,0,803f3c,801e40)
diskfs_demuxer(7ffe30,801e40,7ffde0,0,801e40)
synchronized_demuxer(7ffe30,801e40,0,0,1712)
mach_msg_server_timeout(803ee8,2000,b,900,2710)
thread_function(1,10000,0,0,0)
0x807dc69(20001410,805c720,1d4c0,927c0,0)
0x806440b(
no memory is assigned to address 0806440b
805c720,822cf34,803fe8,80831c5,0)
entry_point(200028b0,80643d0,805c720,0,3f400)
0x0()
thread:	1
Continuation mach_msg_receive_continue
>>>>> user space <<<<<
mach_msg_trap(0x80c7d30(1004ee0,802,0,2000,11)
mach_msg_server_timeout(1006f68,2000,11,800,0)
ports_manage_port_operations_one_thread(20001830,1006fa8,0,807051f,0)
service_paging_requests(200031c0,822cf34,1006fe8,80831c5,0)
entry_point(200032c0,80704e0,200031c0,0,8a3c743e)
0x0()
thread:	2 (plus 12 similar stack traces)
Continuation mach_msg_continue
>>>>> user space <<<<<
mach_msg_trap(0x80c7d30(1808ea4,2,0,18,16)
__pthread_block(20003ac0,0,1,0,0)
__pthread_cond_timedwait_internal(200031d0,200031f8,ffffffff,0,8106f09)
pthread_cond_wait(200031d0,200031f8,0,0,0)
worker_func(20003218,822cf34,1808fe8,80831c5,0)
entry_point(20003ac0,80702c0,20003218,0,
no memory is assigned to address 01809000
0)
0x0()
thread:	12 (plus 80+ similar stack traces awaiting _ports_lock mutex)
switch_context(f668ccf8,0,f55db5c8,c1039f86,dd200e68)+0xa6
thread_invoke(f668ccf8,0,f55db5c8,c103aa65)+0xcf
thread_block(0,dd200eb8,4,f4959cd8,9f)+0x40
gsync_wait(f5978af8,82366e0,2,0,0)+0x186
_Xgsync_wait(dd206010,f9ed1010,c7fa64c0,c1124e40,0)+0x65
ipc_kobject_server(dd206000,2,0,1000)+0x93
mach_msg_trap(6afcd7c,3,40,20,1ba)+0x8ac
>>>>> user space <<<<<
mach_msg_trap(0x80c7d30(6afcd7c,3,40,20,1ba)
gsync_wait(2,82366e0,2,0,0)
pthread_mutex_lock(82366e0,6b00f5c,6afce48,807ef8d,20002708)
ports_begin_rpc(2001c220,89b,6afce80,0,0)
internal_demuxer(6afeee0,6afced0,0,0,0)
mach_msg_server_timeout(6b00f68,2000,2f,800,0)
ports_manage_port_operations_one_thread(200026d0,6b00fa8,0,807051f,0)
service_paging_requests(20008d80,822cf34,6b00fe8,80831c5,0)
--db_more--           entry_point(20008e80,80704e0,20008d80,0,
no memory is assigned to address 06b01000
0)
0x0()
thread:	35
switch_context(f65ea2e0,0,f65eacf0,c1039f86,c1125140)+0xa6
thread_invoke(f65ea2e0,0,f65eacf0,c103aa65)+0xcf
thread_block(0,f65ea2e0,d4499000,c1023246,7f34c1c)+0x40
ipc_mqueue_send(d4499000,0,3e8,0)+0xb3
mach_msg_send(7f34c1c,103,18,3e8,0,f9f51000,2c,1000)+0x12d
mach_msg_trap(7f34c1c,103,18,20,257)+0x5e5
>>>>> user space <<<<<
mach_msg_trap(0x80c7d30(7f34c1c,103,18,20,257)
interrupt_operation(14a,3e8,7f34c88,80ccce6,6cff212)
_hurdsig_abort_rpcs(7f8e008,0,0,7f34ccc,7f34cc8)
hurd_thread_cancel(1be,6201490,7f34dc8,807feed,e9d)
ports_interrupt_rpcs(6201490,8083190,803f78,0,0)
ports_S_interrupt_operation(6201490,e9d,803f78,10000,621f3c0)
_Xinterrupt_operation(7f36ef0,7f34ee0,8083c11,803f3c,7f34ee0)
diskfs_demuxer(7f36ef0,7f34ee0,7f34e90,0,7f34ee0)
synchronized_demuxer(7f36ef0,7f34ee0,0,0,12)
mach_msg_server_timeout(7f38f98,2000,b,900,1d4c0)
thread_function(0,822cf34,7f38fe8,80831c5,0)
entry_point(611efd0,803f78,0,0,
no memory is assigned to address 07f39000
0)
0x0()
thread:	121
Continuation mach_msg_receive_continue
>>>>> user space <<<<<
mach_msg_trap(0x80c7d30(1d585ef0,902,0,2000,b)
mach_msg_server_timeout(1d587f98,2000,b,900,1d4c0)
thread_function(0,822cf34,1d587fe8,80831c5,0)
entry_point(20088310,803f78,0,0,66688b92)
0x0()
db>                               

Script done on 2025-09-01 22:04:04+01:00 [COMMAND_EXIT_CODE="0"]
