[Pacemaker] Corosync 2.3 dies randomly

Robert Parsons rparsons at tappublishing.com
Tue May 7 17:01:31 EDT 2013


> Corosync has a blackbox - did you interrogate that too?

I grabbed the latest source from github and the problem remains. Here's 
some diagnostic output:


strace on the corosync process:


recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999), 
sin_addr=inet_addr("10.1.4.133")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\303\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0)     = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0)     = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 951478063}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000), 
sin_addr=inet_addr("10.1.4.122")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\304\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 952476414}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955080092}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955358106}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955640730}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 956315715}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 956515750}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 57) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 957009047}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 957280723}) = 0
recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999), 
sin_addr=inet_addr("10.1.4.133")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\311\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0)     = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0)     = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 958503649}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000), 
sin_addr=inet_addr("10.1.4.122")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\312\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 959145106}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959414728}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959635399}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959929783}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 960568631}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 961997811}) = 0
recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999), 
sin_addr=inet_addr("10.1.4.133")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\317\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0)     = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0)     = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 963223314}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000), 
sin_addr=inet_addr("10.1.4.122")}, 
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\320\5\0\0o\0\0\0\0\0\0\0w\4\1\n"..., 
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 963959031}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 964185154}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 964537071}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 965621813}) = 0
epoll_wait(4, {{EPOLLIN, {u32=10, u64=4183101728859619338}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 966212452}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 966413788}) = 0
epoll_wait(4, {{EPOLLIN, {u32=10, u64=4183101728859619338}}}, 12, 47) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 966908130}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 967178141}) = 0
recvmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999), 
sin_addr=inet_addr("10.1.4.122")}, 
msg_iov(1)=[{"\376\376\0\0\1\2\"\377z\4\1\n\2z\4\1\n\2\0\n\1\4z\0\0\0\0\0\0\0\0\0"..., 
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 1472
--- SIGBUS (Bus error) @ 0 (0) ---


corosync-blackbox:


Failed to initialize the cmap API. Error CS_ERR_LIBRARY
Failed to initialize the cmap API. Error CS_ERR_LIBRARY
Dumping the contents of /var/lib/corosync/fdata
[debug] shm size:8388608; real_size:8388608; rb->word_size:2097152
[debug] read total of: 8388620
Ringbuffer:
  ->NORMAL
  ->write_pt [673]
  ->read_pt [0]
  ->size [2097152 words]
  =>free [8385912 bytes]
  =>used [2692 bytes]
debug   May 07 15:26:52 handle_new_connection(476):2147483648: IPC 
credentials authenticated (13438-14116-18)
debug   May 07 15:26:52 qb_ipcs_shm_connect(294):9: connecting to client 
[14116]
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 cs_ipcs_connection_created(269):8: connection 
created
debug   May 07 15:26:52 cmap_lib_init_fn(306):9: lib_init_fn: 
conn=0x7feb08b0ea00
debug   May 07 15:26:52 qb_ipcs_dispatch_connection_request(723):9: HUP 
conn (13438-14116-18)
debug   May 07 15:26:52 qb_ipcs_disconnect(565):9: 
qb_ipcs_disconnect(13438-14116-18) state:2
debug   May 07 15:26:52 _del(117):9: epoll_ctl(del): Bad file descriptor (9)
debug   May 07 15:26:52 cs_ipcs_connection_closed(414):8: 
cs_ipcs_connection_closed()
debug   May 07 15:26:52 cmap_lib_exit_fn(325):9: exit_fn for 
conn=0x7feb08b0ea00
debug   May 07 15:26:52 cs_ipcs_connection_destroyed(387):8: 
cs_ipcs_connection_destroyed()
trace   May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace   May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING 
my_posix_sem_destroy()
debug   May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer: 
/dev/shm/qb-cmap-response-13438-14116-18-header
trace   May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace   May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING 
my_posix_sem_destroy()
debug   May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer: 
/dev/shm/qb-cmap-event-13438-14116-18-header
trace   May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace   May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING 
my_posix_sem_destroy()
debug   May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer: 
/dev/shm/qb-cmap-request-13438-14116-18-header
debug   May 07 15:26:52 handle_new_connection(476):2147483648: IPC 
credentials authenticated (13438-14118-18)
debug   May 07 15:26:52 qb_ipcs_shm_connect(294):9: connecting to client 
[14118]
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd; 
rb->word_size:1048576
debug   May 07 15:26:52 cs_ipcs_connection_created(269):8: connection 
created
debug   May 07 15:26:52 cmap_lib_init_fn(306):9: lib_init_fn: 
conn=0x7feb08b0ea00
ERROR: qb_rb_chunk_read failed: Connection timed out
[trace] ENTERING qb_rb_close()
[debug] Free'ing ringbuffer: /dev/shm/qb-create_from_file-header



syslog at moment of death



May  7 15:47:34 krusty corosync[4660]:   [QUORUM] total_votes=6, 
expected_votes=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838839 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838841 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838842 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838844 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838845 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838846 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838849 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838850 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838852 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838853 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838854 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] got nodeinfo message 
from cluster node 167838844
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] nodeinfo message[0]: 
votes: 0, expected: 0 flags: 0
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] got nodeinfo message 
from cluster node 167838846
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] nodeinfo 
message[167838846]: votes: 1, expected: 17 flags: 0
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] flags: quorate: No 
Leaving: No WFA Status: No First: No Qdevice: No QdeviceAlive: No 
QdeviceCastVote: No QdeviceMasterWins: No
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] total_votes=6, 
expected_votes=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838839 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838841 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838842 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838844 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838845 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838846 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838849 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838850 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838852 
state=2, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838853 
state=1, votes=1, expected=17
May  7 15:47:34 krusty corosync[4660]:   [QUORUM] node 167838854 
state=2, votes=1, expected=17
May  7 15:47:34 krusty rsyslogd-2177: imuxsock begins to drop messages 
from pid 4660 due to rate-limiting
May  7 15:47:34 krusty crmd[5631]:   notice: 
corosync_mark_unseen_peer_dead: Node 167838845/maude was not seen in the 
previous transition
May  7 15:47:34 krusty crmd[5631]:   notice: crm_update_peer_state: 
corosync_mark_unseen_peer_dead: Node maude[167838845] - state is now 
lost (was member)
May  7 15:47:36 krusty crmd[5631]:    error: crmd_quorum_destroy: 
connection terminated
May  7 15:47:36 krusty crmd[5631]:    error: 
crmd_cib_connection_destroy: Connection to the CIB terminated...
May  7 15:47:36 krusty stonith-ng[5629]:    error: pcmk_cpg_dispatch: 
Connection to the CPG API failed: 2
May  7 15:47:36 krusty stonith-ng[5629]:    error: 
stonith_peer_ais_destroy: AIS connection terminated
May  7 15:47:36 krusty attrd[5630]:    error: pcmk_cpg_dispatch: 
Connection to the CPG API failed: 2
May  7 15:47:36 krusty attrd[5630]:     crit: attrd_ais_destroy: Lost 
connection to Corosync service!
May  7 15:47:36 krusty attrd[5630]:   notice: main: Exiting...
May  7 15:47:36 krusty attrd[5630]:    error: 
attrd_cib_connection_destroy: Connection to the CIB terminated...
May  7 15:47:36 krusty cib[5628]:    error: pcmk_cpg_dispatch: 
Connection to the CPG API failed: 2
May  7 15:47:36 krusty cib[5628]:    error: cib_ais_destroy: Corosync 
connection lost!  Exiting.


The process does not produce an error message and does not dump core.

Thanks for any help.

- Rob P.










More information about the Pacemaker mailing list