From 6779e5da698feb9b9e02411859ad81885ba46c01 Mon Sep 17 00:00:00 2001 From: Maria Matejka Date: Fri, 20 Dec 2024 11:28:00 +0100 Subject: [PATCH] BGP: fix locking order error on dynamic protocol spawn We missed that the protocol spawner violates the prescribed locking order. When the rtable level is locked, no new protocol can be started, thus we need to: * create the protocol from a clean mainloop context * in protocol start hook, take the socket Testsuite: cf-bgp-autopeer Fixes: #136 Thanks to Job Snijders for reporting: https://trubka.network.cz/pipermail/bird-users/2024-December/017980.html --- nest/proto.c | 19 +++++++++++++++++++ nest/protocol.h | 2 ++ proto/bgp/bgp.c | 46 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/nest/proto.c b/nest/proto.c index dded84f51..678697d69 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -1867,6 +1867,25 @@ proto_spawn(struct proto_config *cf, uint disabled) return p; } +bool +proto_disable(struct proto *p) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + bool changed = !p->disabled; + p->disabled = 1; + proto_rethink_goal(p); + return changed; +} + +bool +proto_enable(struct proto *p) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + bool changed = p->disabled; + p->disabled = 0; + proto_rethink_goal(p); + return changed; +} /** * DOC: Graceful restart recovery diff --git a/nest/protocol.h b/nest/protocol.h index 25ed6f553..cf7ecb898 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -78,6 +78,8 @@ void proto_build(struct protocol *); /* Called from protocol to register itself void protos_preconfig(struct config *); void protos_commit(struct config *new, struct config *old, int type); struct proto * proto_spawn(struct proto_config *cf, uint disabled); +bool proto_disable(struct proto *p); +bool proto_enable(struct proto *p); void protos_dump_all(struct dump_request *); #define GA_UNKNOWN 0 /* Attribute not recognized */ diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 5fc2b5fff..3170e3a42 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -378,8 +378,6 @@ bgp_startup(struct bgp_proto *p) if (p->postponed_sk) { /* Apply postponed incoming connection */ - sk_reloop(p->postponed_sk, p->p.loop); - bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, p->postponed_sk); bgp_send_open(&p->incoming_conn); @@ -583,6 +581,9 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len static void bgp_down(struct bgp_proto *p) { + /* Check that the dynamic BGP socket has been picked up */ + ASSERT_DIE(p->postponed_sk == NULL); + if (bgp_start_state(p) > BSS_PREPARE) { bgp_setup_auth(p, 0); @@ -617,8 +618,8 @@ bgp_decision(void *vp) bgp_down(p); } -static struct bgp_proto * -bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) +static void +bgp_spawn(struct bgp_proto *pp, struct birdsock *sk) { struct symbol *sym; char fmt[SYM_MAX_LEN]; @@ -635,9 +636,16 @@ bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) cfg_mem = NULL; /* Just pass remote_ip to bgp_init() */ - ((struct bgp_config *) sym->proto)->remote_ip = remote_ip; + ((struct bgp_config *) sym->proto)->remote_ip = sk->daddr; + + /* Create the protocol disabled initially */ + SKIP_BACK_DECLARE(struct bgp_proto, p, p, proto_spawn(sym->proto, 1)); - return (void *) proto_spawn(sym->proto, 0); + /* Pass the socket */ + p->postponed_sk = sk; + + /* And enable the protocol */ + proto_enable(&p->p); } void @@ -1489,10 +1497,15 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) /* For dynamic BGP, spawn new instance and postpone the socket */ if (bgp_is_dynamic(p)) { - p = bgp_spawn(p, sk->daddr); - p->postponed_sk = sk; - rmove(sk, p->p.pool); - goto leave; + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + /* The dynamic protocol must be in the START state */ + ASSERT_DIE(p->p.proto_state == PS_START); + birdloop_leave(p->p.loop); + + /* Now we have a clean mainloop */ + bgp_spawn(p, sk); + return 0; } rmove(sk, p->p.pool); @@ -1806,7 +1819,6 @@ bgp_start(struct proto *P) p->incoming_conn.state = BS_IDLE; p->neigh = NULL; p->bfd_req = NULL; - p->postponed_sk = NULL; p->gr_ready = 0; p->gr_active_num = 0; @@ -1848,6 +1860,16 @@ bgp_start(struct proto *P) channel_graceful_restart_lock(&c->c); } + /* Now it's the last chance to move the postponed socket to this BGP, + * as bgp_start is the only hook running from main loop. */ + if (p->postponed_sk) + { + LOCK_DOMAIN(rtable, bgp_listen_domain); + rmove(p->postponed_sk, p->p.pool); + sk_reloop(p->postponed_sk, p->p.loop); + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + } + /* * Before attempting to create the connection, we need to lock the port, * so that we are the only instance attempting to talk with that neighbor. @@ -1999,6 +2021,8 @@ bgp_init(struct proto_config *CF) p->remote_ip = cf->remote_ip; p->remote_as = cf->remote_as; + p->postponed_sk = NULL; + /* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */ if (cf->c.parent) cf->remote_ip = IPA_NONE; -- GitLab