diff --git a/net-misc/bird/bird-3.0.0.ebuild b/net-misc/bird/bird-3.0.0.ebuild index dd31567..56d6759 100644 --- a/net-misc/bird/bird-3.0.0.ebuild +++ b/net-misc/bird/bird-3.0.0.ebuild @@ -37,10 +37,7 @@ FILECAPS=( ) PATCHES=( - "${FILESDIR}"/${P}-proto-lock.patch - "${FILESDIR}"/${P}-nest-rt-table.c.patch - "${FILESDIR}"/${P}-rt-table.c.patch - "${FILESDIR}"/${P}-bgp-med.patch + "${FILESDIR}"/${P}_p20250107.patch ) src_prepare() { diff --git a/net-misc/bird/files/bird-3.0.0-bgp-med.patch b/net-misc/bird/files/bird-3.0.0-bgp-med.patch deleted file mode 100644 index 44403ed..0000000 --- a/net-misc/bird/files/bird-3.0.0-bgp-med.patch +++ /dev/null @@ -1,65 +0,0 @@ -From c5b07695ce810e4345ed1811eadfce935c83b324 Mon Sep 17 00:00:00 2001 -From: Maria Matejka -Date: Tue, 7 Jan 2025 11:08:04 +0100 -Subject: [PATCH] BGP: fixed deterministic med crashes - -There were several places of forgotten NULL checks. - -Thanks to Alarig Le Lay for reporting: -https://trubka.network.cz/pipermail/bird-users/2024-December/017990.html ---- - nest/rt-table.c | 14 ++++++++++++-- - proto/bgp/attrs.c | 8 ++++---- - 2 files changed, 16 insertions(+), 6 deletions(-) - -diff --git a/nest/rt-table.c b/nest/rt-table.c -index 05191d743..fc6d0d4e0 100644 ---- a/nest/rt-table.c -+++ b/nest/rt-table.c -@@ -2024,12 +2024,22 @@ rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct n - do_recalculate: - /* Add the new route to the list right behind the old one */ - if (new_stored) -+ { -+ /* There is the same piece of code several lines farther. Needs refactoring. -+ * The old_stored check is needed because of the possible jump from deterministic med */ -+ if (old_stored) - { - atomic_store_explicit(&new_stored->next, atomic_load_explicit(&old_stored->next, memory_order_relaxed), memory_order_release); - atomic_store_explicit(&old_stored->next, new_stored, memory_order_release); -- -- table->rt_count++; - } -+ else -+ { -+ atomic_store_explicit(&new_stored->next, NULL, memory_order_release); -+ atomic_store_explicit(last_ptr, new_stored, memory_order_release); -+ } -+ -+ table->rt_count++; -+ } - - /* Find a new optimal route (if there is any) */ - struct rte_storage * _Atomic *bp = &local_sentinel.next; -diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c -index 5dc06be51..db6542343 100644 ---- a/proto/bgp/attrs.c -+++ b/proto/bgp/attrs.c -@@ -2689,10 +2689,10 @@ bgp_rte_recalculate(struct rtable_private *table, net *net, - struct rte_storage *new_stored, struct rte_storage *old_stored, struct rte_storage *old_best_stored) - { - struct rte_storage *key_stored = new_stored ? new_stored : old_stored; -- const struct rte *new = &new_stored->rte, -- *old = &old_stored->rte, -- *old_best = &old_best_stored->rte, -- *key = &key_stored->rte; -+ const struct rte *new = RTE_OR_NULL(new_stored), -+ *old = RTE_OR_NULL(old_stored), -+ *old_best = RTE_OR_NULL(old_best_stored), -+ *key = RTE_OR_NULL(key_stored); - - u32 lpref = rt_get_preference(key); - u32 lasn = bgp_get_neighbor(key); --- -GitLab - diff --git a/net-misc/bird/files/bird-3.0.0-nest-rt-table.c.patch b/net-misc/bird/files/bird-3.0.0-nest-rt-table.c.patch deleted file mode 100644 index 50e714e..0000000 --- a/net-misc/bird/files/bird-3.0.0-nest-rt-table.c.patch +++ /dev/null @@ -1,38 +0,0 @@ -From b6caccfd45fb639b6dd3a8d140d3c5ba4cc79311 Mon Sep 17 00:00:00 2001 -From: Maria Matejka -Date: Thu, 19 Dec 2024 11:00:15 +0100 -Subject: [PATCH] Kernel: Fix crash for merge paths on if no route is in BIRD - -There was a missing check for a NULL return value. -Also fixed an indenting error. - -Thanks to Radu Anghel for reporting it: -https://bird.network.cz/pipermail/bird-users/2024-December/017977.html ---- - nest/rt-table.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/nest/rt-table.c b/nest/rt-table.c -index fd8bb50dd..05191d743 100644 ---- a/nest/rt-table.c -+++ b/nest/rt-table.c -@@ -5265,14 +5265,14 @@ krt_export_net(struct channel *c, const net_addr *a, linpool *lp) - if (c->ra_mode == RA_MERGED) - { - struct rt_export_feed *feed = rt_net_feed(c->table, a, NULL); -- if (!feed->count_routes) -+ if (!feed || !feed->count_routes) - return NULL; - - if (!bmap_test(&c->export_accepted_map, feed->block[0].id)) - return NULL; - - return rt_export_merged(c, feed, lp, 1); -- } -+ } - - static _Thread_local rte best; - best = rt_net_best(c->table, a); --- -GitLab - diff --git a/net-misc/bird/files/bird-3.0.0-proto-lock.patch b/net-misc/bird/files/bird-3.0.0-proto-lock.patch deleted file mode 100644 index 3de5e4b..0000000 --- a/net-misc/bird/files/bird-3.0.0-proto-lock.patch +++ /dev/null @@ -1,176 +0,0 @@ -From 6779e5da698feb9b9e02411859ad81885ba46c01 Mon Sep 17 00:00:00 2001 -From: Maria Matejka -Date: Fri, 20 Dec 2024 11:28:00 +0100 -Subject: [PATCH] BGP: fix locking order error on dynamic protocol spawn - -We missed that the protocol spawner violates the prescribed -locking order. When the rtable level is locked, no new protocol can be -started, thus we need to: - -* create the protocol from a clean mainloop context -* in protocol start hook, take the socket - -Testsuite: cf-bgp-autopeer -Fixes: #136 - -Thanks to Job Snijders for reporting: -https://trubka.network.cz/pipermail/bird-users/2024-December/017980.html ---- - nest/proto.c | 19 +++++++++++++++++++ - nest/protocol.h | 2 ++ - proto/bgp/bgp.c | 46 +++++++++++++++++++++++++++++++++++----------- - 3 files changed, 56 insertions(+), 11 deletions(-) - -diff --git a/nest/proto.c b/nest/proto.c -index dded84f51..678697d69 100644 ---- a/nest/proto.c -+++ b/nest/proto.c -@@ -1867,6 +1867,25 @@ proto_spawn(struct proto_config *cf, uint disabled) - return p; - } - -+bool -+proto_disable(struct proto *p) -+{ -+ ASSERT_DIE(birdloop_inside(&main_birdloop)); -+ bool changed = !p->disabled; -+ p->disabled = 1; -+ proto_rethink_goal(p); -+ return changed; -+} -+ -+bool -+proto_enable(struct proto *p) -+{ -+ ASSERT_DIE(birdloop_inside(&main_birdloop)); -+ bool changed = p->disabled; -+ p->disabled = 0; -+ proto_rethink_goal(p); -+ return changed; -+} - - /** - * DOC: Graceful restart recovery -diff --git a/nest/protocol.h b/nest/protocol.h -index 25ed6f553..cf7ecb898 100644 ---- a/nest/protocol.h -+++ b/nest/protocol.h -@@ -78,6 +78,8 @@ void proto_build(struct protocol *); /* Called from protocol to register itself - void protos_preconfig(struct config *); - void protos_commit(struct config *new, struct config *old, int type); - struct proto * proto_spawn(struct proto_config *cf, uint disabled); -+bool proto_disable(struct proto *p); -+bool proto_enable(struct proto *p); - void protos_dump_all(struct dump_request *); - - #define GA_UNKNOWN 0 /* Attribute not recognized */ -diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c -index 5fc2b5fff..3170e3a42 100644 ---- a/proto/bgp/bgp.c -+++ b/proto/bgp/bgp.c -@@ -378,8 +378,6 @@ bgp_startup(struct bgp_proto *p) - if (p->postponed_sk) - { - /* Apply postponed incoming connection */ -- sk_reloop(p->postponed_sk, p->p.loop); -- - bgp_setup_conn(p, &p->incoming_conn); - bgp_setup_sk(&p->incoming_conn, p->postponed_sk); - bgp_send_open(&p->incoming_conn); -@@ -583,6 +581,9 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len - static void - bgp_down(struct bgp_proto *p) - { -+ /* Check that the dynamic BGP socket has been picked up */ -+ ASSERT_DIE(p->postponed_sk == NULL); -+ - if (bgp_start_state(p) > BSS_PREPARE) - { - bgp_setup_auth(p, 0); -@@ -617,8 +618,8 @@ bgp_decision(void *vp) - bgp_down(p); - } - --static struct bgp_proto * --bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) -+static void -+bgp_spawn(struct bgp_proto *pp, struct birdsock *sk) - { - struct symbol *sym; - char fmt[SYM_MAX_LEN]; -@@ -635,9 +636,16 @@ bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) - cfg_mem = NULL; - - /* Just pass remote_ip to bgp_init() */ -- ((struct bgp_config *) sym->proto)->remote_ip = remote_ip; -+ ((struct bgp_config *) sym->proto)->remote_ip = sk->daddr; -+ -+ /* Create the protocol disabled initially */ -+ SKIP_BACK_DECLARE(struct bgp_proto, p, p, proto_spawn(sym->proto, 1)); - -- return (void *) proto_spawn(sym->proto, 0); -+ /* Pass the socket */ -+ p->postponed_sk = sk; -+ -+ /* And enable the protocol */ -+ proto_enable(&p->p); - } - - void -@@ -1489,10 +1497,15 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) - /* For dynamic BGP, spawn new instance and postpone the socket */ - if (bgp_is_dynamic(p)) - { -- p = bgp_spawn(p, sk->daddr); -- p->postponed_sk = sk; -- rmove(sk, p->p.pool); -- goto leave; -+ UNLOCK_DOMAIN(rtable, bgp_listen_domain); -+ -+ /* The dynamic protocol must be in the START state */ -+ ASSERT_DIE(p->p.proto_state == PS_START); -+ birdloop_leave(p->p.loop); -+ -+ /* Now we have a clean mainloop */ -+ bgp_spawn(p, sk); -+ return 0; - } - - rmove(sk, p->p.pool); -@@ -1806,7 +1819,6 @@ bgp_start(struct proto *P) - p->incoming_conn.state = BS_IDLE; - p->neigh = NULL; - p->bfd_req = NULL; -- p->postponed_sk = NULL; - p->gr_ready = 0; - p->gr_active_num = 0; - -@@ -1848,6 +1860,16 @@ bgp_start(struct proto *P) - channel_graceful_restart_lock(&c->c); - } - -+ /* Now it's the last chance to move the postponed socket to this BGP, -+ * as bgp_start is the only hook running from main loop. */ -+ if (p->postponed_sk) -+ { -+ LOCK_DOMAIN(rtable, bgp_listen_domain); -+ rmove(p->postponed_sk, p->p.pool); -+ sk_reloop(p->postponed_sk, p->p.loop); -+ UNLOCK_DOMAIN(rtable, bgp_listen_domain); -+ } -+ - /* - * Before attempting to create the connection, we need to lock the port, - * so that we are the only instance attempting to talk with that neighbor. -@@ -1999,6 +2021,8 @@ bgp_init(struct proto_config *CF) - p->remote_ip = cf->remote_ip; - p->remote_as = cf->remote_as; - -+ p->postponed_sk = NULL; -+ - /* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */ - if (cf->c.parent) - cf->remote_ip = IPA_NONE; --- -GitLab - diff --git a/net-misc/bird/files/bird-3.0.0-rt-table.c.patch b/net-misc/bird/files/bird-3.0.0-rt-table.c.patch deleted file mode 100644 index 13a6a71..0000000 --- a/net-misc/bird/files/bird-3.0.0-rt-table.c.patch +++ /dev/null @@ -1,87 +0,0 @@ -From 2e14832d36c83b2ab5b7fb28b701de554fa5fdd9 Mon Sep 17 00:00:00 2001 -From: Maria Matejka -Date: Tue, 7 Jan 2025 12:13:57 +0100 -Subject: [PATCH] Table: old best route refeed fix - -When refeeding with RA_OPTIMAL, the old best routes weren't announced, -leading to weird behavior of protocols, mostly kernel. Fixed. ---- - nest/rt-table.c | 30 ++++++++++++++++++++++++++---- - 1 file changed, 26 insertions(+), 4 deletions(-) - -diff --git a/nest/rt-table.c b/nest/rt-table.c -index fc6d0d4e0..18a445a62 100644 ---- a/nest/rt-table.c -+++ b/nest/rt-table.c -@@ -1485,11 +1485,18 @@ channel_notify_basic(void *_channel) - rte *new = &u->feed->block[i]; - rte *old = NULL; - for (uint o = oldpos; o < u->feed->count_routes; o++) -- if (new->src == u->feed->block[o].src) -+ if ((c->ra_mode == RA_ANY) && (new->src == u->feed->block[o].src)) - { - old = &u->feed->block[o]; - break; - } -+ else if ((c->ra_mode == RA_OPTIMAL) && ( -+ bmap_test(&c->export_accepted_map, u->feed->block[o].id) || -+ bmap_test(&c->export_rejected_map, u->feed->block[o].id))) -+ { -+ ASSERT_DIE(!old); -+ old = &u->feed->block[o]; -+ } - - rt_notify_basic(c, new, old); - -@@ -2542,10 +2549,14 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool - last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire); - first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first); - -- uint ecnt = 0; -+ uint ecnt = 0, ocnt = 0; - for (const struct rt_pending_export *rpe = first; rpe; - rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) -+ { - ecnt++; -+ if (rpe->it.old) -+ ocnt++; -+ } - - if (ecnt) { - const net_addr *a = (first->it.new ?: first->it.old)->net; -@@ -2558,10 +2569,11 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool - if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net))) - return NULL; - -- struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt); -+ struct rt_export_feed *feed = rt_alloc_feed(!!best + ocnt, ecnt); -+ uint bpos = 0; - if (best) - { -- feed->block[0] = best->rte; -+ feed->block[bpos++] = best->rte; - feed->ni = NET_TO_INDEX(best->rte.net); - } - else -@@ -2575,8 +2587,18 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool - if (e >= ecnt) - RT_READ_RETRY(tr); - else -+ { - feed->exports[e++] = rpe->it.seq; -+ if (rpe->it.old) -+ { -+ ASSERT_DIE(bpos < !!best + ocnt); -+ feed->block[bpos] = *rpe->it.old; -+ feed->block[bpos].flags |= REF_OBSOLETE; -+ bpos++; -+ } -+ } - -+ ASSERT_DIE(bpos == !!best + ocnt); - ASSERT_DIE(e == ecnt); - } - --- -GitLab - diff --git a/net-misc/bird/files/bird-3.0.0_p20250107.patch b/net-misc/bird/files/bird-3.0.0_p20250107.patch new file mode 100644 index 0000000..004f599 --- /dev/null +++ b/net-misc/bird/files/bird-3.0.0_p20250107.patch @@ -0,0 +1,1758 @@ +diff --git a/lib/resource.h b/lib/resource.h +index 48bf1f9b..12b78851 100644 +--- a/lib/resource.h ++++ b/lib/resource.h +@@ -139,6 +139,20 @@ void *sl_allocz(slab *); + void sl_free(void *); + void sl_delete(slab *); + ++/* A whole stonehenge of slabs */ ++ ++typedef struct stonehenge stonehenge; ++typedef struct sth_block { ++ void *block; ++ bool large; ++} sth_block; ++ ++stonehenge *sth_new(pool *); ++sth_block sth_alloc(stonehenge *, uint size); ++sth_block sth_allocz(stonehenge *, uint size); ++void sth_free(sth_block); ++void sth_delete(stonehenge *); ++ + /* + * Low-level memory allocation functions, please don't use + * outside resource manager and possibly sysdep code. +diff --git a/lib/slab.c b/lib/slab.c +index ca971f9f..d68bfef1 100644 +--- a/lib/slab.c ++++ b/lib/slab.c +@@ -469,4 +469,66 @@ slab_lookup(resource *r, unsigned long a) + return NULL; + } + ++static const uint stonehenge_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 }; ++ ++struct stonehenge { ++ pool *p; ++ slab *s[ARRAY_SIZE(stonehenge_sizes)]; ++}; ++ ++sth_block ++sth_alloc(stonehenge *sth, uint size) ++{ ++ for (uint i=0; is[i]) ++ sth->s[i] = sl_new(sth->p, stonehenge_sizes[i]); ++ ++ return (sth_block) { .block = sl_alloc(sth->s[i]), }; ++ } ++ ++ return (sth_block) { ++ .block = mb_alloc(sth->p, size), ++ .large = 1, ++ }; ++} ++ ++sth_block ++sth_allocz(stonehenge *sth, uint size) ++{ ++ sth_block b = sth_alloc(sth, size); ++ bzero(b.block, size); ++ return b; ++} ++ ++void ++sth_free(sth_block b) ++{ ++ if (b.large) ++ mb_free(b.block); ++ else ++ sl_free(b.block); ++} ++ ++stonehenge * ++sth_new(pool *pp) ++{ ++ stonehenge tmps = { ++ .p = rp_new(pp, pp->domain, "Stonehenge"), ++ }; ++ ++ stonehenge *s = sth_alloc(&tmps, sizeof(stonehenge)).block; ++ *s = tmps; ++ return s; ++} ++ ++void sth_delete(stonehenge *s) ++{ ++ pool *p = s->p; ++ sth_free((sth_block) { s }); ++ rp_free(p); ++} ++ ++ + #endif +diff --git a/nest/bfd.h b/nest/bfd.h +index 5dacff5d..c046152f 100644 +--- a/nest/bfd.h ++++ b/nest/bfd.h +@@ -18,8 +18,11 @@ struct bfd_options { + u32 min_tx_int; + u32 idle_tx_int; + u8 multiplier; +- u8 passive; +- u8 passive_set; ++ PACKED enum bfd_opt_passive { ++ BFD_OPT_PASSIVE_UNKNOWN = 0, ++ BFD_OPT_PASSIVE, ++ BFD_OPT_NOT_PASSIVE, ++ } passive; + u8 mode; + u8 auth_type; /* Authentication type (BFD_AUTH_*) */ + list *passwords; /* Passwords for authentication */ +diff --git a/nest/cli.c b/nest/cli.c +index 3b8e6f46..b33ffd43 100644 +--- a/nest/cli.c ++++ b/nest/cli.c +@@ -81,13 +81,14 @@ cli_alloc_out(cli *c, int size) + o = c->tx_buf; + else + { +- o = mb_alloc(c->pool, sizeof(struct cli_out) + CLI_TX_BUF_SIZE); ++ o = alloc_page(); ++ c->tx_pending_count++; + if (c->tx_write) + c->tx_write->next = o; + else + c->tx_buf = o; + o->wpos = o->outpos = o->buf; +- o->end = o->buf + CLI_TX_BUF_SIZE; ++ o->end = (void *) o + page_size; + } + c->tx_write = o; + if (!c->tx_pos) +@@ -167,19 +168,18 @@ cli_hello(cli *c) + static void + cli_free_out(cli *c) + { +- struct cli_out *o, *p; ++ for (struct cli_out *o = c->tx_buf, *n; o; o = n) ++ { ++ n = o->next; ++ free_page(o); ++ c->tx_pending_count--; ++ } + +- if (o = c->tx_buf) +- { +- o->wpos = o->outpos = o->buf; +- while (p = o->next) +- { +- o->next = p->next; +- mb_free(p); +- } +- } ++ c->tx_buf = NULL; + c->tx_write = c->tx_pos = NULL; + c->async_msg_size = 0; ++ ++ ASSERT_DIE(c->tx_pending_count == 0); + } + + void +@@ -189,6 +189,38 @@ cli_written(cli *c) + ev_schedule(c->event); + } + ++/* A dummy resource to show and free memory pages allocated for pending TX */ ++struct cli_tx_resource { ++ resource r; ++ struct cli *c; ++}; ++ ++static void ++cli_tx_resource_free(resource *r) ++{ ++ cli_free_out(SKIP_BACK(struct cli_tx_resource, r, r)->c); ++} ++ ++static void ++cli_tx_resource_dump(struct dump_request *dreq UNUSED, resource *r UNUSED) {} ++ ++static struct resmem ++cli_tx_resource_memsize(resource *r) ++{ ++ return (struct resmem) { ++ .effective = SKIP_BACK(struct cli_tx_resource, r, r)->c->tx_pending_count * page_size, ++ .overhead = sizeof(struct cli_tx_resource), ++ }; ++} ++ ++static struct resclass cli_tx_resource_class = { ++ .name = "CLI TX buffers", ++ .size = sizeof (struct cli_tx_resource), ++ .free = cli_tx_resource_free, ++ .dump = cli_tx_resource_dump, ++ .memsize = cli_tx_resource_memsize, ++}; ++ + + static byte *cli_rh_pos; + static uint cli_rh_len; +@@ -272,7 +304,8 @@ cli * + cli_new(struct birdsock *sock, struct cli_config *cf) + { + pool *p = rp_new(cli_pool, the_bird_domain.the_bird, "CLI"); +- cli *c = mb_alloc(p, sizeof(cli)); ++ struct cli_tx_resource *ctr = ralloc(p, &cli_tx_resource_class); ++ cli *c = ctr->c = mb_alloc(p, sizeof(cli)); + + bzero(c, sizeof(cli)); + c->pool = p; +diff --git a/nest/cli.h b/nest/cli.h +index d86ec380..671be04d 100644 +--- a/nest/cli.h ++++ b/nest/cli.h +@@ -17,7 +17,6 @@ + #include "conf/conf.h" + + #define CLI_RX_BUF_SIZE 4096 +-#define CLI_TX_BUF_SIZE 4096 + #define CLI_MAX_ASYNC_QUEUE 4096 + + #define CLI_MSG_SIZE 500 +@@ -49,6 +48,7 @@ typedef struct cli { + uint log_mask; /* Mask of allowed message levels */ + uint log_threshold; /* When free < log_threshold, store only important messages */ + uint async_msg_size; /* Total size of async messages queued in tx_buf */ ++ uint tx_pending_count; /* How many blocks are pending */ + } cli; + + struct cli_config { +diff --git a/nest/proto.c b/nest/proto.c +index dded84f5..caf99829 100644 +--- a/nest/proto.c ++++ b/nest/proto.c +@@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list); + #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) + #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) + +-static timer *gr_wait_timer; +- +-#define GRS_NONE 0 +-#define GRS_INIT 1 +-#define GRS_ACTIVE 2 +-#define GRS_DONE 3 +- +-static int graceful_restart_state; +-static u32 graceful_restart_locks; ++static struct graceful_recovery_context _graceful_recovery_context; ++OBSREF(struct graceful_recovery_context) graceful_recovery_context; + + static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; + static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; +@@ -676,9 +669,11 @@ void channel_notify_basic(void *); + void channel_notify_accepted(void *); + void channel_notify_merged(void *); + +-static void ++void + channel_start_export(struct channel *c) + { ++ ASSERT_DIE(birdloop_inside(c->proto->loop)); ++ + if (rt_export_get_state(&c->out_req) != TES_DOWN) + bug("%s.%s: Attempted to start channel's already started export", c->proto->name, c->name); + +@@ -910,7 +905,7 @@ channel_do_stop(struct channel *c) + ev_postpone(&c->reimport_event); + + c->gr_wait = 0; +- if (c->gr_lock) ++ if (OBSREF_GET(c->gr_lock)) + channel_graceful_restart_unlock(c); + + CALL(c->class->shutdown, c); +@@ -1405,7 +1400,7 @@ proto_start(struct proto *p) + DBG("Kicking %s up\n", p->name); + PD(p, "Starting"); + +- if (graceful_restart_state == GRS_INIT) ++ if (OBSREF_GET(graceful_recovery_context)) + p->gr_recovery = 1; + + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) +@@ -1867,6 +1862,25 @@ proto_spawn(struct proto_config *cf, uint disabled) + return p; + } + ++bool ++proto_disable(struct proto *p) ++{ ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); ++ bool changed = !p->disabled; ++ p->disabled = 1; ++ proto_rethink_goal(p); ++ return changed; ++} ++ ++bool ++proto_enable(struct proto *p) ++{ ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); ++ bool changed = p->disabled; ++ p->disabled = 0; ++ proto_rethink_goal(p); ++ return changed; ++} + + /** + * DOC: Graceful restart recovery +@@ -1900,7 +1914,45 @@ proto_spawn(struct proto_config *cf, uint disabled) + * + */ + +-static void graceful_restart_done(timer *t); ++/** ++ * graceful_restart_done - finalize graceful restart ++ * @t: unused ++ * ++ * When there are no locks on graceful restart, the functions finalizes the ++ * graceful restart recovery. Protocols postponing route export until the end of ++ * the recovery are awakened and the export to them is enabled. ++ */ ++static void ++graceful_recovery_done(struct callback *_ UNUSED) ++{ ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); ++ ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE); ++ ++ tm_stop(&_graceful_recovery_context.wait_timer); ++ log(L_INFO "Graceful recovery done"); ++ ++ WALK_TLIST(proto, p, &global_proto_list) ++ PROTO_LOCKED_FROM_MAIN(p) ++ { ++ p->gr_recovery = 0; ++ ++ struct channel *c; ++ WALK_LIST(c, p->channels) ++ { ++ ASSERT_DIE(!OBSREF_GET(c->gr_lock)); ++ ++ /* Resume postponed export of routes */ ++ if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) ++ channel_start_export(c); ++ ++ /* Cleanup */ ++ c->gr_wait = 0; ++ } ++ } ++ ++ _graceful_recovery_context.grc_state = GRS_DONE; ++} ++ + + /** + * graceful_restart_recovery - request initial graceful restart recovery +@@ -1912,7 +1964,30 @@ static void graceful_restart_done(timer *t); + void + graceful_restart_recovery(void) + { +- graceful_restart_state = GRS_INIT; ++ obstacle_target_init( ++ &_graceful_recovery_context.obstacles, ++ &_graceful_recovery_context.obstacles_cleared, ++ &root_pool, "Graceful recovery"); ++ ++ OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context); ++ _graceful_recovery_context.grc_state = GRS_INIT; ++} ++ ++static void ++graceful_recovery_timeout(timer *t UNUSED) ++{ ++ log(L_INFO "Graceful recovery timeout"); ++ WALK_TLIST(proto, p, &global_proto_list) ++ PROTO_LOCKED_FROM_MAIN(p) ++ { ++ struct channel *c; ++ WALK_LIST(c, p->channels) ++ if (OBSREF_GET(c->gr_lock)) ++ { ++ log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name); ++ OBSREF_CLEAR(c->gr_lock); ++ } ++ } + } + + /** +@@ -1925,73 +2000,35 @@ graceful_restart_recovery(void) + void + graceful_restart_init(void) + { +- if (!graceful_restart_state) ++ if (!OBSREF_GET(graceful_recovery_context)) + return; + +- log(L_INFO "Graceful restart started"); ++ log(L_INFO "Graceful recovery started"); + +- if (!graceful_restart_locks) +- { +- graceful_restart_done(NULL); +- return; +- } ++ _graceful_recovery_context.grc_state = GRS_ACTIVE; + +- graceful_restart_state = GRS_ACTIVE; +- gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0); ++ _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout }; + u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait; +- tm_start(gr_wait_timer, gr_wait S); +-} ++ tm_start(&_graceful_recovery_context.wait_timer, gr_wait S); + +-/** +- * graceful_restart_done - finalize graceful restart +- * @t: unused +- * +- * When there are no locks on graceful restart, the functions finalizes the +- * graceful restart recovery. Protocols postponing route export until the end of +- * the recovery are awakened and the export to them is enabled. All other +- * related state is cleared. The function is also called when the graceful +- * restart wait timer fires (but there are still some locks). +- */ +-static void +-graceful_restart_done(timer *t) +-{ +- log(L_INFO "Graceful restart done"); +- graceful_restart_state = GRS_DONE; ++ callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop); + +- WALK_TLIST(proto, p, &global_proto_list) +- { +- if (!p->gr_recovery) +- continue; +- +- struct channel *c; +- WALK_LIST(c, p->channels) +- { +- /* Resume postponed export of routes */ +- if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) +- channel_start_export(c); +- +- /* Cleanup */ +- c->gr_wait = 0; +- c->gr_lock = 0; +- } +- +- p->gr_recovery = 0; +- } +- +- graceful_restart_locks = 0; +- +- rfree(t); ++ /* The last clearing of obstacle reference will cause ++ * the graceful recovery finish immediately. */ ++ OBSREF_CLEAR(graceful_recovery_context); + } + + void + graceful_restart_show_status(void) + { +- if (graceful_restart_state != GRS_ACTIVE) ++ if (_graceful_recovery_context.grc_state != GRS_ACTIVE) + return; + + cli_msg(-24, "Graceful restart recovery in progress"); +- cli_msg(-24, " Waiting for %d channels to recover", graceful_restart_locks); +- cli_msg(-24, " Wait timer is %t/%u", tm_remains(gr_wait_timer), ++ cli_msg(-24, " Waiting for %u channels to recover", ++ obstacle_target_count(&_graceful_recovery_context.obstacles)); ++ cli_msg(-24, " Wait timer is %t/%u", ++ tm_remains(&_graceful_recovery_context.wait_timer), + atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait); + } + +@@ -2011,14 +2048,22 @@ graceful_restart_show_status(void) + void + channel_graceful_restart_lock(struct channel *c) + { +- ASSERT(graceful_restart_state == GRS_INIT); +- ASSERT(c->proto->gr_recovery); ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); + +- if (c->gr_lock) ++ if (OBSREF_GET(c->gr_lock)) + return; + +- c->gr_lock = 1; +- graceful_restart_locks++; ++ switch (_graceful_recovery_context.grc_state) ++ { ++ case GRS_INIT: ++ case GRS_ACTIVE: ++ OBSREF_SET(c->gr_lock, &_graceful_recovery_context); ++ break; ++ ++ case GRS_NONE: ++ case GRS_DONE: ++ break; ++ } + } + + /** +@@ -2031,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c) + void + channel_graceful_restart_unlock(struct channel *c) + { +- if (!c->gr_lock) +- return; +- +- c->gr_lock = 0; +- graceful_restart_locks--; +- +- if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks) +- tm_start(gr_wait_timer, 0); ++ OBSREF_CLEAR(c->gr_lock); + } + + +- + /** + * protos_dump_all - dump status of all protocols + * +@@ -2594,9 +2631,9 @@ channel_show_info(struct channel *c) + cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter)); + cli_msg(-1006, " Output filter: %s", filter_name(c->out_filter)); + +- if (graceful_restart_state == GRS_ACTIVE) ++ if (_graceful_recovery_context.grc_state == GRS_ACTIVE) + cli_msg(-1006, " GR recovery: %s%s", +- c->gr_lock ? " pending" : "", ++ OBSREF_GET(c->gr_lock) ? " pending" : "", + c->gr_wait ? " waiting" : ""); + + channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]); +diff --git a/nest/protocol.h b/nest/protocol.h +index 25ed6f55..ec561b26 100644 +--- a/nest/protocol.h ++++ b/nest/protocol.h +@@ -78,6 +78,8 @@ void proto_build(struct protocol *); /* Called from protocol to register itself + void protos_preconfig(struct config *); + void protos_commit(struct config *new, struct config *old, int type); + struct proto * proto_spawn(struct proto_config *cf, uint disabled); ++bool proto_disable(struct proto *p); ++bool proto_enable(struct proto *p); + void protos_dump_all(struct dump_request *); + + #define GA_UNKNOWN 0 /* Attribute not recognized */ +@@ -657,7 +659,7 @@ struct channel { + + u8 channel_state; + u8 reloadable; /* Hook reload_routes() is allowed on the channel */ +- u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ ++ OBSREF(struct graceful_recovery_context) gr_lock; /* Graceful restart mechanism should wait for this channel */ + u8 gr_wait; /* Route export to channel is postponed until graceful restart */ + + u32 obstacles; /* External obstacles remaining before cleanup */ +@@ -745,6 +747,8 @@ int proto_configure_channel(struct proto *p, struct channel **c, struct channel_ + + void channel_set_state(struct channel *c, uint state); + ++void channel_start_export(struct channel *c); ++ + void channel_add_obstacle(struct channel *c); + void channel_del_obstacle(struct channel *c); + +@@ -759,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint + void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); + int channel_reconfigure(struct channel *c, struct channel_config *cf); + ++struct graceful_recovery_context { ++ struct obstacle_target obstacles; ++ struct callback obstacles_cleared; ++ enum { ++ GRS_NONE, ++ GRS_INIT, ++ GRS_ACTIVE, ++ GRS_DONE, ++ } grc_state; ++ timer wait_timer; ++}; ++ + #endif +diff --git a/nest/rt-attr.c b/nest/rt-attr.c +index a0f7d571..9d5e1098 100644 +--- a/nest/rt-attr.c ++++ b/nest/rt-attr.c +@@ -204,9 +204,7 @@ DOMAIN(attrs) attrs_domain; + + pool *rta_pool; + +-/* Assuming page size of 4096, these are magic values for slab allocation */ +-static const uint ea_slab_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 }; +-static slab *ea_slab[ARRAY_SIZE(ea_slab_sizes)]; ++static stonehenge *ea_sth; + + static slab *rte_src_slab; + +@@ -969,8 +967,8 @@ ea_list_size(ea_list *o) + * and creates the final structure useful for storage or fast searching. + * The method is a bucket sort. + * +- * Returns the final ea_list with some excess memory at the end, +- * allocated from the tmp_linpool. The adata is linked from the original places. ++ * Returns the final ea_list allocated from the tmp_linpool. ++ * The adata is linked from the original places. + */ + ea_list * + ea_normalize(ea_list *e, u32 upto) +@@ -978,21 +976,17 @@ ea_normalize(ea_list *e, u32 upto) + /* We expect some work to be actually needed. */ + ASSERT_DIE(!BIT32_TEST(&upto, e->stored)); + +- /* Allocate the output */ +- ea_list *out = tmp_allocz(ea_class_max * sizeof(eattr) + sizeof(ea_list)); +- *out = (ea_list) { +- .flags = EALF_SORTED, +- }; +- ++ /* Allocate the buckets locally */ ++ eattr *buckets = allocz(ea_class_max * sizeof(eattr)); + uint min_id = ~0, max_id = 0; + +- eattr *buckets = out->attrs; ++ ea_list *next = NULL; + + /* Walk the attribute lists, one after another. */ + for (; e; e = e->next) + { +- if (!out->next && BIT32_TEST(&upto, e->stored)) +- out->next = e; ++ if (!next && BIT32_TEST(&upto, e->stored)) ++ next = e; + + for (int i = 0; i < e->count; i++) + { +@@ -1002,7 +996,7 @@ ea_normalize(ea_list *e, u32 upto) + if (id < min_id) + min_id = id; + +- if (out->next) ++ if (next) + { + /* Underlay: check whether the value is duplicate */ + if (buckets[id].id && buckets[id].fresh) +@@ -1028,6 +1022,18 @@ ea_normalize(ea_list *e, u32 upto) + } + } + ++ /* Find out how big the output actually is. */ ++ uint len = 0; ++ for (uint id = min_id; id <= max_id; id++) ++ if (buckets[id].id && !(buckets[id].undef && buckets[id].fresh)) ++ len++; ++ ++ ea_list *out = tmp_alloc(sizeof(ea_list) + len * sizeof(eattr)); ++ *out = (ea_list) { ++ .flags = EALF_SORTED, ++ .next = next, ++ }; ++ + /* And now we just walk the list from beginning to end and collect + * everything to the beginning of the list. + * Walking just that part which is inhabited for sure. */ +@@ -1046,9 +1052,12 @@ ea_normalize(ea_list *e, u32 upto) + + /* Move the attribute to the beginning */ + ASSERT_DIE(out->count < id); +- buckets[out->count++] = buckets[id]; ++ ASSERT_DIE(out->count < len); ++ out->attrs[out->count++] = buckets[id]; + } + ++ ASSERT_DIE(out->count == len); ++ + /* We want to bisect only if the list is long enough */ + if (out->count > 5) + out->flags |= EALF_BISECT; +@@ -1583,24 +1592,18 @@ ea_lookup_slow(ea_list *o, u32 squash_upto, enum ea_stored oid) + return rr; + } + +- struct ea_storage *r = NULL; + uint elen = ea_list_size(o); + uint sz = elen + sizeof(struct ea_storage); +- for (uint i=0; il, o, elen); + ea_list_ref(r->l); + +- r->l->flags |= huge; ++ if (b.large) ++ r->l->flags |= EALF_HUGE; ++ + r->l->stored = oid; + r->hash_key = h; + atomic_store_explicit(&r->uc, 1, memory_order_release); +@@ -1668,10 +1671,7 @@ ea_free_deferred(struct deferred_call *dc) + + /* And now we can free the object, finally */ + ea_list_unref(r->l); +- if (r->l->flags & EALF_HUGE) +- mb_free(r); +- else +- sl_free(r); ++ sth_free((sth_block) { r, !!(r->l->flags & EALF_HUGE) }); + + RTA_UNLOCK; + } +@@ -1722,9 +1722,7 @@ rta_init(void) + RTA_LOCK; + rta_pool = rp_new(&root_pool, attrs_domain.attrs, "Attributes"); + +- for (uint i=0; inext = f->feed_pending; +- f->feed_pending = rfr; ++ if (f->feeding) ++ { ++ rfr->next = f->feed_pending; ++ f->feed_pending = rfr; ++ } ++ else ++ { ++ rfr->next = NULL; ++ f->feeding = rfr; ++ } + } + + void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr) +diff --git a/nest/rt-show.c b/nest/rt-show.c +index 3986da83..aa9209ca 100644 +--- a/nest/rt-show.c ++++ b/nest/rt-show.c +@@ -282,8 +282,9 @@ rt_show_cont(struct cli *c) + rt_show_table(d); + + RT_FEED_WALK(&d->tab->req, f) +- if (f->count_routes) +- rt_show_net(d, f); ++ TMP_SAVED ++ if (f->count_routes) ++ rt_show_net(d, f); + + if (rt_export_feed_active(&d->tab->req)) + rt_feeder_unsubscribe(&d->tab->req); +diff --git a/nest/rt-table.c b/nest/rt-table.c +index fd8bb50d..18a445a6 100644 +--- a/nest/rt-table.c ++++ b/nest/rt-table.c +@@ -1485,11 +1485,18 @@ channel_notify_basic(void *_channel) + rte *new = &u->feed->block[i]; + rte *old = NULL; + for (uint o = oldpos; o < u->feed->count_routes; o++) +- if (new->src == u->feed->block[o].src) ++ if ((c->ra_mode == RA_ANY) && (new->src == u->feed->block[o].src)) + { + old = &u->feed->block[o]; + break; + } ++ else if ((c->ra_mode == RA_OPTIMAL) && ( ++ bmap_test(&c->export_accepted_map, u->feed->block[o].id) || ++ bmap_test(&c->export_rejected_map, u->feed->block[o].id))) ++ { ++ ASSERT_DIE(!old); ++ old = &u->feed->block[o]; ++ } + + rt_notify_basic(c, new, old); + +@@ -2024,13 +2031,23 @@ rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct n + do_recalculate: + /* Add the new route to the list right behind the old one */ + if (new_stored) ++ { ++ /* There is the same piece of code several lines farther. Needs refactoring. ++ * The old_stored check is needed because of the possible jump from deterministic med */ ++ if (old_stored) + { + atomic_store_explicit(&new_stored->next, atomic_load_explicit(&old_stored->next, memory_order_relaxed), memory_order_release); + atomic_store_explicit(&old_stored->next, new_stored, memory_order_release); +- +- table->rt_count++; ++ } ++ else ++ { ++ atomic_store_explicit(&new_stored->next, NULL, memory_order_release); ++ atomic_store_explicit(last_ptr, new_stored, memory_order_release); + } + ++ table->rt_count++; ++ } ++ + /* Find a new optimal route (if there is any) */ + struct rte_storage * _Atomic *bp = &local_sentinel.next; + struct rte_storage *best = atomic_load_explicit(bp, memory_order_relaxed); +@@ -2532,10 +2549,14 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool + last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire); + first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first); + +- uint ecnt = 0; ++ uint ecnt = 0, ocnt = 0; + for (const struct rt_pending_export *rpe = first; rpe; + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) ++ { + ecnt++; ++ if (rpe->it.old) ++ ocnt++; ++ } + + if (ecnt) { + const net_addr *a = (first->it.new ?: first->it.old)->net; +@@ -2548,10 +2569,11 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool + if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net))) + return NULL; + +- struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt); ++ struct rt_export_feed *feed = rt_alloc_feed(!!best + ocnt, ecnt); ++ uint bpos = 0; + if (best) + { +- feed->block[0] = best->rte; ++ feed->block[bpos++] = best->rte; + feed->ni = NET_TO_INDEX(best->rte.net); + } + else +@@ -2565,8 +2587,18 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool + if (e >= ecnt) + RT_READ_RETRY(tr); + else ++ { + feed->exports[e++] = rpe->it.seq; ++ if (rpe->it.old) ++ { ++ ASSERT_DIE(bpos < !!best + ocnt); ++ feed->block[bpos] = *rpe->it.old; ++ feed->block[bpos].flags |= REF_OBSOLETE; ++ bpos++; ++ } ++ } + ++ ASSERT_DIE(bpos == !!best + ocnt); + ASSERT_DIE(e == ecnt); + } + +@@ -5265,14 +5297,14 @@ krt_export_net(struct channel *c, const net_addr *a, linpool *lp) + if (c->ra_mode == RA_MERGED) + { + struct rt_export_feed *feed = rt_net_feed(c->table, a, NULL); +- if (!feed->count_routes) ++ if (!feed || !feed->count_routes) + return NULL; + + if (!bmap_test(&c->export_accepted_map, feed->block[0].id)) + return NULL; + + return rt_export_merged(c, feed, lp, 1); +- } ++ } + + static _Thread_local rte best; + best = rt_net_best(c->table, a); +diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c +index 34f992b9..4997f803 100644 +--- a/proto/bfd/bfd.c ++++ b/proto/bfd/bfd.c +@@ -172,17 +172,17 @@ static void bfd_free_iface(struct bfd_iface *ifa); + * BFD sessions + */ + +-static inline struct bfd_session_config +-bfd_merge_options(const struct bfd_iface_config *cf, const struct bfd_options *opts) ++static inline struct bfd_options ++bfd_merge_options(const struct bfd_options *bottom, const struct bfd_options *top) + { +- return (struct bfd_session_config) { +- .min_rx_int = opts->min_rx_int ?: cf->min_rx_int, +- .min_tx_int = opts->min_tx_int ?: cf->min_tx_int, +- .idle_tx_int = opts->idle_tx_int ?: cf->idle_tx_int, +- .multiplier = opts->multiplier ?: cf->multiplier, +- .passive = opts->passive_set ? opts->passive : cf->passive, +- .auth_type = opts->auth_type ?: cf->auth_type, +- .passwords = opts->passwords ?: cf->passwords, ++ return (struct bfd_options) { ++ .min_rx_int = top->min_rx_int ?: bottom->min_rx_int, ++ .min_tx_int = top->min_tx_int ?: bottom->min_tx_int, ++ .idle_tx_int = top->idle_tx_int ?: bottom->idle_tx_int, ++ .multiplier = top->multiplier ?: bottom->multiplier, ++ .passive = top->passive ?: bottom->passive, ++ .auth_type = top->auth_type ?: bottom->auth_type, ++ .passwords = top->passwords ?: bottom->passwords, + }; + } + +@@ -478,7 +478,7 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * + HASH_INSERT(p->session_hash_id, HASH_ID, s); + HASH_INSERT(p->session_hash_ip, HASH_IP, s); + +- s->cf = bfd_merge_options(ifa->cf, opts); ++ s->cf = bfd_merge_options(&ifa->cf->opts, opts); + + /* Initialization of state variables - see RFC 5880 6.8.1 */ + s->loc_state = BFD_STATE_DOWN; +@@ -561,26 +561,58 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) + birdloop_leave(p->p.loop); + } + ++struct bfd_reconfigure_sessions_deferred_call { ++ struct deferred_call dc; ++ struct bfd_proto *p; ++ config_ref old_config; ++}; ++ + static void +-bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) ++bfd_reconfigure_sessions(struct deferred_call *dc) + { +- if (EMPTY_LIST(s->request_list)) +- return; ++ SKIP_BACK_DECLARE(struct bfd_reconfigure_sessions_deferred_call, ++ brsdc, dc, dc); + +- ASSERT_DIE(birdloop_inside(p->p.loop)); ++ struct bfd_proto *p = brsdc->p; ++ birdloop_enter(p->p.loop); + +- SKIP_BACK_DECLARE(struct bfd_request, req, n, HEAD(s->request_list)); +- s->cf = bfd_merge_options(s->ifa->cf, &req->opts); ++ HASH_WALK(p->session_hash_id, next_id, s) ++ { ++ if (!EMPTY_LIST(s->request_list)) ++ { ++ SKIP_BACK_DECLARE(struct bfd_request, req, n, HEAD(s->request_list)); ++ struct bfd_options opts = bfd_merge_options(&s->ifa->cf->opts, &req->opts); + +- u32 tx = (s->loc_state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int; +- bfd_session_set_min_tx(s, tx); +- bfd_session_set_min_rx(s, s->cf.min_rx_int); +- s->detect_mult = s->cf.multiplier; +- s->passive = s->cf.passive; ++#define CHK(x) (opts.x != s->cf.x) || ++ bool reload = MACRO_FOREACH(CHK, ++ min_rx_int, ++ min_tx_int, ++ idle_tx_int, ++ multiplier, ++ passive) false; /* terminating the || chain */ ++#undef CHK + +- bfd_session_control_tx_timer(s, 0); ++ s->cf = opts; ++ ++ if (reload) ++ { ++ u32 tx = (s->loc_state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int; ++ bfd_session_set_min_tx(s, tx); ++ bfd_session_set_min_rx(s, s->cf.min_rx_int); ++ s->detect_mult = s->cf.multiplier; ++ s->passive = s->cf.passive; ++ ++ bfd_session_control_tx_timer(s, 0); ++ ++ TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); ++ } ++ } ++ } ++ HASH_WALK_END; ++ birdloop_leave(p->p.loop); + +- TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); ++ /* Now the config is clean */ ++ OBSREF_CLEAR(brsdc->old_config); + } + + +@@ -589,10 +621,12 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) + */ + + static struct bfd_iface_config bfd_default_iface = { +- .min_rx_int = BFD_DEFAULT_MIN_RX_INT, +- .min_tx_int = BFD_DEFAULT_MIN_TX_INT, +- .idle_tx_int = BFD_DEFAULT_IDLE_TX_INT, +- .multiplier = BFD_DEFAULT_MULTIPLIER, ++ .opts = { ++ .min_rx_int = BFD_DEFAULT_MIN_RX_INT, ++ .min_tx_int = BFD_DEFAULT_MIN_TX_INT, ++ .idle_tx_int = BFD_DEFAULT_IDLE_TX_INT, ++ .multiplier = BFD_DEFAULT_MULTIPLIER, ++ }, + }; + + static inline struct bfd_iface_config * +@@ -650,24 +684,6 @@ bfd_free_iface(struct bfd_iface *ifa) + mb_free(ifa); + } + +-static void +-bfd_reconfigure_iface(struct bfd_proto *p UNUSED, struct bfd_iface *ifa, struct bfd_config *nc) +-{ +- struct bfd_iface_config *new = bfd_find_iface_config(nc, ifa->iface); +- struct bfd_iface_config *old = ifa->cf; +- +- /* Check options that are handled in bfd_reconfigure_session() */ +- ifa->changed = +- (new->min_rx_int != old->min_rx_int) || +- (new->min_tx_int != old->min_tx_int) || +- (new->idle_tx_int != old->idle_tx_int) || +- (new->multiplier != old->multiplier) || +- (new->passive != old->passive); +- +- /* This should be probably changed to not access ifa->cf from the BFD thread */ +- ifa->cf = new; +-} +- + + /* + * BFD requests +@@ -900,20 +916,7 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, + void + bfd_update_request(struct bfd_request *req, const struct bfd_options *opts) + { +- struct bfd_session *s = req->session; +- +- if (!memcmp(opts, &req->opts, sizeof(const struct bfd_options))) +- return; +- + req->opts = *opts; +- +- if (s) +- { +- struct bfd_proto *p = s->ifa->bfd; +- birdloop_enter(p->p.loop); +- bfd_reconfigure_session(p, s); +- birdloop_leave(p->p.loop); +- } + } + + static void +@@ -1193,21 +1196,22 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) + (new->zero_udp6_checksum_rx != old->zero_udp6_checksum_rx)) + return 0; + +- birdloop_mask_wakeups(p->p.loop); +- + WALK_LIST(ifa, p->iface_list) +- bfd_reconfigure_iface(p, ifa, new); +- +- HASH_WALK(p->session_hash_id, next_id, s) +- { +- if (s->ifa->changed) +- bfd_reconfigure_session(p, s); +- } +- HASH_WALK_END; ++ ifa->cf = bfd_find_iface_config(new, ifa->iface); + + bfd_reconfigure_neighbors(p, new); + +- birdloop_unmask_wakeups(p->p.loop); ++ /* Sessions get reconfigured after all the config is applied */ ++ struct bfd_reconfigure_sessions_deferred_call brsdc = { ++ .dc.hook = bfd_reconfigure_sessions, ++ .p = p, ++ }; ++ SKIP_BACK_DECLARE(struct bfd_reconfigure_sessions_deferred_call, ++ brsdcp, dc, defer_call(&brsdc.dc, sizeof brsdc)); ++ ++ /* We need to keep the old config alive until all the sessions get ++ * reconfigured */ ++ OBSREF_SET(brsdcp->old_config, P->cf->global); + + return 1; + } +diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h +index 578ce875..107829b7 100644 +--- a/proto/bfd/bfd.h ++++ b/proto/bfd/bfd.h +@@ -54,24 +54,7 @@ struct bfd_config + struct bfd_iface_config + { + struct iface_patt i; +- u32 min_rx_int; +- u32 min_tx_int; +- u32 idle_tx_int; +- u8 multiplier; +- u8 passive; +- u8 auth_type; /* Authentication type (BFD_AUTH_*) */ +- list *passwords; /* Passwords for authentication */ +-}; +- +-struct bfd_session_config +-{ +- u32 min_rx_int; +- u32 min_tx_int; +- u32 idle_tx_int; +- u8 multiplier; +- u8 passive; +- u8 auth_type; /* Authentication type (BFD_AUTH_*) */ +- list *passwords; /* Passwords for authentication */ ++ struct bfd_options opts; + }; + + struct bfd_neighbor +@@ -146,7 +129,7 @@ struct bfd_session + u32 loc_id; /* Local session ID (local discriminator) */ + u32 rem_id; /* Remote session ID (remote discriminator) */ + +- struct bfd_session_config cf; /* Static configuration parameters */ ++ struct bfd_options cf; /* Static configuration parameters */ + + u32 des_min_tx_int; /* Desired min rx interval, local option */ + u32 des_min_tx_new; /* Used for des_min_tx_int change */ +diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y +index 9e9919c4..56d1ffac 100644 +--- a/proto/bfd/config.Y ++++ b/proto/bfd/config.Y +@@ -86,44 +86,37 @@ bfd_iface_start: + add_tail(&BFD_CFG->patt_list, NODE this_ipatt); + init_list(&this_ipatt->ipn_list); + +- BFD_IFACE->min_rx_int = BFD_DEFAULT_MIN_RX_INT; +- BFD_IFACE->min_tx_int = BFD_DEFAULT_MIN_TX_INT; +- BFD_IFACE->idle_tx_int = BFD_DEFAULT_IDLE_TX_INT; +- BFD_IFACE->multiplier = BFD_DEFAULT_MULTIPLIER; ++ this_bfd_opts = &BFD_IFACE->opts; ++ ++ this_bfd_opts->min_rx_int = BFD_DEFAULT_MIN_RX_INT; ++ this_bfd_opts->min_tx_int = BFD_DEFAULT_MIN_TX_INT; ++ this_bfd_opts->idle_tx_int = BFD_DEFAULT_IDLE_TX_INT; ++ this_bfd_opts->multiplier = BFD_DEFAULT_MULTIPLIER; + + reset_passwords(); + }; + + bfd_iface_finish: + { +- BFD_IFACE->passwords = get_passwords(); ++ this_bfd_opts->passwords = get_passwords(); + +- if (!BFD_IFACE->auth_type != !BFD_IFACE->passwords) ++ if (!this_bfd_opts->auth_type != !this_bfd_opts->passwords) + cf_warn("Authentication and password options should be used together"); + +- if (BFD_IFACE->passwords) ++ if (this_bfd_opts->passwords) + { + struct password_item *pass; +- WALK_LIST(pass, *BFD_IFACE->passwords) ++ WALK_LIST(pass, *this_bfd_opts->passwords) + { + if (pass->alg) + cf_error("Password algorithm option not available in BFD protocol"); + +- pass->alg = bfd_auth_type_to_hash_alg[BFD_IFACE->auth_type]; ++ pass->alg = bfd_auth_type_to_hash_alg[this_bfd_opts->auth_type]; + } + } +-}; + +-bfd_iface_item: +- INTERVAL expr_us { BFD_IFACE->min_rx_int = BFD_IFACE->min_tx_int = $2; } +- | MIN RX INTERVAL expr_us { BFD_IFACE->min_rx_int = $4; } +- | MIN TX INTERVAL expr_us { BFD_IFACE->min_tx_int = $4; } +- | IDLE TX INTERVAL expr_us { BFD_IFACE->idle_tx_int = $4; } +- | MULTIPLIER expr { BFD_IFACE->multiplier = $2; } +- | PASSIVE bool { BFD_IFACE->passive = $2; } +- | AUTHENTICATION bfd_auth_type { BFD_IFACE->auth_type = $2; } +- | password_list {} +- ; ++ this_bfd_opts = NULL; ++}; + + bfd_auth_type: + NONE { $$ = BFD_AUTH_NONE; } +@@ -134,14 +127,9 @@ bfd_auth_type: + | METICULOUS KEYED SHA1 { $$ = BFD_AUTH_METICULOUS_KEYED_SHA1; } + ; + +-bfd_iface_opts: +- /* empty */ +- | bfd_iface_opts bfd_iface_item ';' +- ; +- + bfd_iface_opt_list: + /* empty */ +- | '{' bfd_iface_opts '}' ++ | '{' bfd_items '}' + ; + + bfd_iface: +@@ -194,7 +182,7 @@ bfd_item: + | MIN TX INTERVAL expr_us { this_bfd_opts->min_tx_int = $4; } + | IDLE TX INTERVAL expr_us { this_bfd_opts->idle_tx_int = $4; } + | MULTIPLIER expr { this_bfd_opts->multiplier = $2; } +- | PASSIVE bool { this_bfd_opts->passive = $2; this_bfd_opts->passive_set = 1; } ++ | PASSIVE bool { this_bfd_opts->passive = $2 ? BFD_OPT_PASSIVE : BFD_OPT_NOT_PASSIVE; } + | GRACEFUL { this_bfd_opts->mode = BGP_BFD_GRACEFUL; } + | AUTHENTICATION bfd_auth_type { this_bfd_opts->auth_type = $2; } + | password_list {} +diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c +index 1ceb470c..f8bd63d7 100644 +--- a/proto/bfd/packets.c ++++ b/proto/bfd/packets.c +@@ -109,7 +109,7 @@ const u8 bfd_auth_type_to_hash_alg[] = { + static void + bfd_fill_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_ctl_packet *pkt) + { +- struct bfd_session_config *cf = &s->cf; ++ struct bfd_options *cf = &s->cf; + struct password_item *pass = password_find(cf->passwords, 0); + uint meticulous = 0; + +@@ -179,7 +179,7 @@ bfd_fill_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_c + static int + bfd_check_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_ctl_packet *pkt) + { +- struct bfd_session_config *cf = &s->cf; ++ struct bfd_options *cf = &s->cf; + const char *err_dsc = NULL; + uint err_val = 0; + uint auth_type = 0; +diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c +index a2feaef5..db654234 100644 +--- a/proto/bgp/attrs.c ++++ b/proto/bgp/attrs.c +@@ -1192,7 +1192,7 @@ static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { + .decode = bgp_decode_large_community, + }, + [BA_ONLY_TO_CUSTOMER] = { +- .name = "otc", ++ .name = "bgp_otc", + .type = T_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, +@@ -1734,13 +1734,16 @@ bgp_get_bucket(struct bgp_ptx_private *c, ea_list *new) + uint size = sizeof(struct bgp_bucket) + ea_size; + + /* Allocate the bucket */ +- b = mb_alloc(c->pool, size); ++ sth_block blk = sth_alloc(c->sth, size); ++ b = blk.block; + *b = (struct bgp_bucket) { }; + init_list(&b->prefixes); + b->hash = hash; + + /* Copy the ea_list */ + ea_list_copy(b->eattrs, new, ea_size); ++ if (blk.large) ++ b->eattrs->flags |= EALF_HUGE; + + /* Insert the bucket to bucket hash */ + HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); +@@ -1764,7 +1767,7 @@ static void + bgp_free_bucket(struct bgp_ptx_private *c, struct bgp_bucket *b) + { + HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); +- mb_free(b); ++ sth_free((sth_block) { b, !!(b->eattrs->flags & EALF_HUGE) }); + } + + int +@@ -2086,6 +2089,7 @@ bgp_init_pending_tx(struct bgp_channel *c) + + bpp->lock = dom; + bpp->pool = p; ++ bpp->sth = sth_new(p); + bpp->c = c; + + bgp_init_bucket_table(bpp); +@@ -2160,8 +2164,7 @@ bgp_free_pending_tx(struct bgp_channel *bc) + HASH_WALK_END; + + HASH_FREE(c->bucket_hash); +- sl_delete(c->bucket_slab); +- c->bucket_slab = NULL; ++ sth_delete(c->sth); + + rp_free(c->pool); + +@@ -2686,10 +2689,10 @@ bgp_rte_recalculate(struct rtable_private *table, net *net, + struct rte_storage *new_stored, struct rte_storage *old_stored, struct rte_storage *old_best_stored) + { + struct rte_storage *key_stored = new_stored ? new_stored : old_stored; +- const struct rte *new = &new_stored->rte, +- *old = &old_stored->rte, +- *old_best = &old_best_stored->rte, +- *key = &key_stored->rte; ++ const struct rte *new = RTE_OR_NULL(new_stored), ++ *old = RTE_OR_NULL(old_stored), ++ *old_best = RTE_OR_NULL(old_best_stored), ++ *key = RTE_OR_NULL(key_stored); + + u32 lpref = rt_get_preference(key); + u32 lasn = bgp_get_neighbor(key); +diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c +index 5fc2b5ff..3170e3a4 100644 +--- a/proto/bgp/bgp.c ++++ b/proto/bgp/bgp.c +@@ -378,8 +378,6 @@ bgp_startup(struct bgp_proto *p) + if (p->postponed_sk) + { + /* Apply postponed incoming connection */ +- sk_reloop(p->postponed_sk, p->p.loop); +- + bgp_setup_conn(p, &p->incoming_conn); + bgp_setup_sk(&p->incoming_conn, p->postponed_sk); + bgp_send_open(&p->incoming_conn); +@@ -583,6 +581,9 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len + static void + bgp_down(struct bgp_proto *p) + { ++ /* Check that the dynamic BGP socket has been picked up */ ++ ASSERT_DIE(p->postponed_sk == NULL); ++ + if (bgp_start_state(p) > BSS_PREPARE) + { + bgp_setup_auth(p, 0); +@@ -617,8 +618,8 @@ bgp_decision(void *vp) + bgp_down(p); + } + +-static struct bgp_proto * +-bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) ++static void ++bgp_spawn(struct bgp_proto *pp, struct birdsock *sk) + { + struct symbol *sym; + char fmt[SYM_MAX_LEN]; +@@ -635,9 +636,16 @@ bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip) + cfg_mem = NULL; + + /* Just pass remote_ip to bgp_init() */ +- ((struct bgp_config *) sym->proto)->remote_ip = remote_ip; ++ ((struct bgp_config *) sym->proto)->remote_ip = sk->daddr; ++ ++ /* Create the protocol disabled initially */ ++ SKIP_BACK_DECLARE(struct bgp_proto, p, p, proto_spawn(sym->proto, 1)); + +- return (void *) proto_spawn(sym->proto, 0); ++ /* Pass the socket */ ++ p->postponed_sk = sk; ++ ++ /* And enable the protocol */ ++ proto_enable(&p->p); + } + + void +@@ -1489,10 +1497,15 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) + /* For dynamic BGP, spawn new instance and postpone the socket */ + if (bgp_is_dynamic(p)) + { +- p = bgp_spawn(p, sk->daddr); +- p->postponed_sk = sk; +- rmove(sk, p->p.pool); +- goto leave; ++ UNLOCK_DOMAIN(rtable, bgp_listen_domain); ++ ++ /* The dynamic protocol must be in the START state */ ++ ASSERT_DIE(p->p.proto_state == PS_START); ++ birdloop_leave(p->p.loop); ++ ++ /* Now we have a clean mainloop */ ++ bgp_spawn(p, sk); ++ return 0; + } + + rmove(sk, p->p.pool); +@@ -1806,7 +1819,6 @@ bgp_start(struct proto *P) + p->incoming_conn.state = BS_IDLE; + p->neigh = NULL; + p->bfd_req = NULL; +- p->postponed_sk = NULL; + p->gr_ready = 0; + p->gr_active_num = 0; + +@@ -1848,6 +1860,16 @@ bgp_start(struct proto *P) + channel_graceful_restart_lock(&c->c); + } + ++ /* Now it's the last chance to move the postponed socket to this BGP, ++ * as bgp_start is the only hook running from main loop. */ ++ if (p->postponed_sk) ++ { ++ LOCK_DOMAIN(rtable, bgp_listen_domain); ++ rmove(p->postponed_sk, p->p.pool); ++ sk_reloop(p->postponed_sk, p->p.loop); ++ UNLOCK_DOMAIN(rtable, bgp_listen_domain); ++ } ++ + /* + * Before attempting to create the connection, we need to lock the port, + * so that we are the only instance attempting to talk with that neighbor. +@@ -1999,6 +2021,8 @@ bgp_init(struct proto_config *CF) + p->remote_ip = cf->remote_ip; + p->remote_as = cf->remote_as; + ++ p->postponed_sk = NULL; ++ + /* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */ + if (cf->c.parent) + cf->remote_ip = IPA_NONE; +diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h +index 202e78ba..dac6e84e 100644 +--- a/proto/bgp/bgp.h ++++ b/proto/bgp/bgp.h +@@ -452,7 +452,8 @@ struct bgp_ptx_private { + struct { BGP_PTX_PUBLIC; }; + struct bgp_ptx_private **locked_at; + +- pool *pool; /* Resource pool for TX related allocations */ ++ pool *pool; /* Pool for infrequent long-term blocks */ ++ stonehenge *sth; /* Bucket allocator */ + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ +@@ -461,7 +462,6 @@ struct bgp_ptx_private { + HASH(struct bgp_prefix) prefix_hash; /* Hash table of pending prefices */ + + slab *prefix_slab; /* Slab holding prefix nodes */ +- slab *bucket_slab; /* Slab holding buckets to send */ + + char bmp; /* This is a fake ptx for BMP encoding */ + }; +diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c +index f69189e0..a72c69a0 100644 +--- a/sysdep/unix/io-loop.c ++++ b/sysdep/unix/io-loop.c +@@ -1403,7 +1403,7 @@ bool task_still_in_limit(void) + { + static u64 main_counter = 0; + if (this_birdloop == &main_birdloop) +- return (++main_counter % 2048); /* This is a hack because of no accounting in mainloop */ ++ return (++main_counter % 512); /* This is a hack because of no accounting in mainloop */ + else + return ns_now() < account_last + this_thread->max_loop_time_ns; + } +diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c +index f9785c07..51395e1e 100644 +--- a/sysdep/unix/io.c ++++ b/sysdep/unix/io.c +@@ -53,14 +53,15 @@ + + /* Maximum number of calls of tx handler for one socket in one + * poll iteration. Should be small enough to not monopolize CPU by +- * one protocol instance. ++ * one protocol instance. But as most of the problems are now offloaded ++ * to worker threads, too low values may actually bring problems with ++ * latency. + */ +-#define MAX_STEPS 4 ++#define MAX_STEPS 2048 + + /* Maximum number of calls of rx handler for all sockets in one poll +- iteration. RX callbacks are often much more costly so we limit +- this to gen small latencies */ +-#define MAX_RX_STEPS 4 ++ iteration. RX callbacks are often a little bit more costly. */ ++#define MAX_RX_STEPS 512 + + + /* +@@ -2581,8 +2582,6 @@ io_init(void) + srandom((uint) (now ^ (now >> 32))); + } + +-static int short_loops = 0; +-#define SHORT_LOOP_MAX 10 + #define WORK_EVENTS_MAX 10 + + sock *stored_sock; +@@ -2670,10 +2669,9 @@ io_loop(void) + { + if (pfd.pfd.data[0].revents & POLLIN) + { +- /* IO loop reload requested */ ++ /* Somebody sent an event to mainloop */ + pipe_drain(&main_birdloop.thread->wakeup); + atomic_fetch_and_explicit(&main_birdloop.thread_transition, ~LTT_PING, memory_order_acq_rel); +- continue; + } + + times_update(); +@@ -2719,11 +2717,6 @@ io_loop(void) + main_birdloop.sock_active = sk_next(s); + } + +- short_loops++; +- if (events && (short_loops < SHORT_LOOP_MAX)) +- continue; +- short_loops = 0; +- + int count = 0; + main_birdloop.sock_active = stored_sock; + if (main_birdloop.sock_active == NULL) +diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c +index 2770b8be..1658dd6f 100644 +--- a/sysdep/unix/krt.c ++++ b/sysdep/unix/krt.c +@@ -342,6 +342,8 @@ krt_learn_async(struct krt_proto *p, rte *e, int new) + /* Hook defined in nest/rt-table.c ... to be refactored away later */ + rte *krt_export_net(struct channel *c, const net_addr *a, linpool *lp); + ++static void krt_rt_notify(struct proto *P, struct channel *ch, const net_addr *net, rte *new, const rte *old); ++ + static int + krt_same_dest(rte *k, rte *e) + { +@@ -361,6 +363,11 @@ krt_same_dest(rte *k, rte *e) + void + krt_got_route(struct krt_proto *p, rte *e, s8 src) + { ++ /* If we happen to get an asynchronous route notification ++ * before initialization, we wait for the scan. */ ++ if (p->sync_state == KPS_INIT) ++ return; ++ + rte *new = NULL; + e->pflags = 0; + +@@ -391,10 +398,6 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) + + /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ + +- /* We wait for the initial feed to have correct installed state */ +- if (!p->ready) +- goto ignore; +- + /* Get the exported version */ + new = krt_export_net(p->p.main_channel, e->net, krt_filter_lp); + +@@ -423,10 +426,6 @@ aseen: + krt_trace_in(p, e, "already seen"); + goto done; + +-ignore: +- krt_trace_in(p, e, "ignored"); +- goto done; +- + update: + krt_trace_in(p, new, "updating"); + krt_replace_rte(p, e->net, new, e); +@@ -447,12 +446,21 @@ krt_init_scan(struct krt_proto *p) + { + switch (p->sync_state) + { ++ case KPS_INIT: ++ /* Allow exports now */ ++ p->p.rt_notify = krt_rt_notify; ++ channel_start_export(p->p.main_channel); ++ rt_refresh_begin(&p->p.main_channel->in_req); ++ p->sync_state = KPS_FIRST_SCAN; ++ return 1; ++ + case KPS_IDLE: + rt_refresh_begin(&p->p.main_channel->in_req); + bmap_reset(&p->seen_map, 1024); + p->sync_state = KPS_SCANNING; + return 1; + ++ case KPS_FIRST_SCAN: + case KPS_SCANNING: + bug("Kernel scan double-init"); + +@@ -470,14 +478,17 @@ krt_prune(struct krt_proto *p) + { + switch (p->sync_state) + { ++ case KPS_INIT: + case KPS_IDLE: + bug("Kernel scan prune without scan"); + + case KPS_SCANNING: ++ channel_request_full_refeed(p->p.main_channel); ++ /* fall through */ ++ case KPS_FIRST_SCAN: + p->sync_state = KPS_PRUNING; + KRT_TRACE(p, D_EVENTS, "Pruning table %s", p->p.main_channel->table->name); + rt_refresh_end(&p->p.main_channel->in_req); +- channel_request_full_refeed(p->p.main_channel); + break; + + case KPS_PRUNING: +@@ -549,7 +560,7 @@ krt_scan_all(timer *t UNUSED) + krt_do_scan(NULL); + + WALK_LIST2(p, n, krt_proto_list, krt_node) +- if (p->sync_state == KPS_SCANNING) ++ if ((p->sync_state == KPS_SCANNING) || (p->sync_state == KPS_FIRST_SCAN)) + krt_prune(p); + } + +@@ -644,6 +655,9 @@ krt_scan_timer_kick(struct krt_proto *p) + static int + krt_preexport(struct channel *C, rte *e) + { ++ /* The export should not start before proper sync */ ++ ASSERT_DIE(SKIP_BACK(struct krt_proto, p, C->proto)->sync_state != KPS_INIT); ++ + if (e->src->owner == &C->proto->sources) + #ifdef CONFIG_SINGLE_ROUTE + return 1; +@@ -659,20 +673,11 @@ krt_preexport(struct channel *C, rte *e) + return -1; + } + +- /* Before first scan we don't touch the routes */ +- if (!SKIP_BACK(struct krt_proto, p, C->proto)->ready) +- { +- if (C->debug & D_ROUTES) +- log(L_TRACE "%s.%s not ready yet to accept route for %N", +- C->proto->name, C->name, e->net); +- return -1; +- } +- + return 0; + } + + static void +-krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, ++krt_rt_notify(struct proto *P, struct channel *ch, const net_addr *net, + rte *new, const rte *old) + { + struct krt_proto *p = (struct krt_proto *) P; +@@ -685,16 +690,30 @@ krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, + + switch (p->sync_state) + { ++ case KPS_INIT: ++ bug("Routes in init state should have been rejected by preexport."); ++ + case KPS_IDLE: + case KPS_PRUNING: + if (new && bmap_test(&p->seen_map, new->id)) +- /* Already installed and seen in the kernel dump */ ++ { ++ if (ch->debug & D_ROUTES) ++ { ++ /* Already installed and seen in the kernel dump */ ++ log(L_TRACE "%s.%s: %N already in kernel", ++ P->name, ch->name, net); ++ } + return; ++ } + + /* fall through */ ++ case KPS_FIRST_SCAN: + case KPS_SCANNING: + /* Actually replace the route */ + krt_replace_rte(p, net, new, old); ++ if (ch->debug & D_ROUTES) ++ log(L_TRACE "%s.%s: %N %s kernel", ++ P->name, ch->name, net, old ? "replaced in" : "added to"); + break; + + } +@@ -724,7 +743,6 @@ krt_reload_routes(struct channel *C, struct rt_feeding_request *rfr) + + if (KRT_CF->learn) + { +- p->reload = 1; + krt_scan_timer_kick(p); + } + +@@ -741,15 +759,18 @@ krt_export_fed(struct channel *C) + { + struct krt_proto *p = (void *) C->proto; + +- p->ready = 1; +- p->initialized = 1; +- + switch (p->sync_state) + { ++ case KPS_INIT: ++ bug("KRT export started before scan"); ++ + case KPS_IDLE: + krt_scan_timer_kick(p); + break; + ++ case KPS_FIRST_SCAN: ++ bug("KRT export done before first scan"); ++ + case KPS_SCANNING: + break; + +@@ -823,7 +844,8 @@ krt_init(struct proto_config *CF) + p->p.main_channel = proto_add_channel(&p->p, proto_cf_main_channel(CF)); + + p->p.preexport = krt_preexport; +- p->p.rt_notify = krt_rt_notify; ++ /* Not setting rt_notify here to not start exports, must wait for the first scan ++ * and then we can start exports manually */ + p->p.iface_sub.if_notify = krt_if_notify; + p->p.reload_routes = krt_reload_routes; + p->p.export_fed = krt_export_fed; +@@ -879,7 +901,7 @@ krt_shutdown(struct proto *P) + return PS_FLUSH; + + /* FIXME we should flush routes even when persist during reconfiguration */ +- if (p->initialized && !KRT_CF->persist && (P->down_code != PDC_CMD_GR_DOWN)) ++ if ((p->sync_state != KPS_INIT) && !KRT_CF->persist && (P->down_code != PDC_CMD_GR_DOWN)) + { + struct rt_export_feeder req = (struct rt_export_feeder) + { +@@ -914,8 +936,7 @@ krt_shutdown(struct proto *P) + static void + krt_cleanup(struct krt_proto *p) + { +- p->ready = 0; +- p->initialized = 0; ++ p->sync_state = KPS_INIT; + + krt_sys_shutdown(p); + rem_node(&p->krt_node); +diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h +index 394e7401..14be715f 100644 +--- a/sysdep/unix/krt.h ++++ b/sysdep/unix/krt.h +@@ -59,10 +59,9 @@ struct krt_proto { + struct bmap seen_map; /* Routes seen during last periodic scan */ + node krt_node; /* Node in krt_proto_list */ + byte af; /* Kernel address family (AF_*) */ +- byte ready; /* Initial feed has been finished */ +- byte initialized; /* First scan has been finished */ +- byte reload; /* Next scan is doing reload */ + PACKED enum krt_prune_state { ++ KPS_INIT, ++ KPS_FIRST_SCAN, + KPS_IDLE, + KPS_SCANNING, + KPS_PRUNING,