Skip to content

Commit ad546ed

Browse files
authored
Mas i994 handoffsync (#995)
* Initial tidy: Remove legacy code from handoff_sender. The handoff_receiver must keep otherwise-unused references, because during an upgrade an updated node could be the receiver for a sender on a non-updated node. This is not such an issue for non-batch handoff, as all versions supported for upgrade already support batch. However, the receiver must still indicate that it supports batch, due to the above problem. All handoff receiver/sender code has been tidied down to 80-column width.
* Change AckSync to every batch by default: Do away with the timer-based sync, and ack-sync based on threshold only. Also log ongoing transfer progress at every ack-log threshold. Log the reason for an error at the point of the error - this avoids a generic {shutdown, timeout} error with no clue as to the actual point of origin in the code.
* Standardise send_sync into a function: Fix the issue that the configure message does not respond with a sync, and so another sync is now required.
* Receiver needs the vnode module, not the master.
* Make first log indicate batch_count of 0: It is confusing that the first log on the sending of a batch indicates a batch_count of 100 rather than 0.
* Batch threshold can be either count or size: Clarify the log text, as batch_size is no longer fixed.
* Reinstate keepalive of receiver: There is a need to distinguish between a failed fold and a slow fold - and so the keepalive of the receiver has value. It is now implemented by checking a keepalive_next time at every visit item, rather than by continuously entering and exiting a selective receive.
* Further comments
* Update after review
* Metadata exchange on join: Make a metadata exchange part of the join process. This prevents the situation where a bucket type is active in a cluster, then a node joins (as part of cluster expansion, say), but the bucket properties are not known to the joining node during handoff of objects of that type. Now, the join cannot be staged without a metadata exchange, so that all joining nodes know of cluster metadata (e.g. bucket types) before the join is committed and handoffs start.
* Attempt exchange on join: It is only an attempt - a failure (i.e. a timeout) would be no different to the current state with a potential race, so we don't block joins. Joins will normally be safer because of this.
1 parent 5da9f75 commit ad546ed

File tree

5 files changed

+568
-415
lines changed

5 files changed

+568
-415
lines changed

src/riak_core.erl

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -140,25 +140,28 @@ standard_join(Node, Ring, Rejoin, Auto) ->
140140
{error, different_ring_sizes};
141141
_ ->
142142
GossipVsn = riak_core_gossip:gossip_version(),
143-
Ring2 = riak_core_ring:add_member(node(), Ring,
144-
node()),
143+
Ring2 =
144+
riak_core_ring:add_member(
145+
node(), Ring, node()),
145146
Ring3 = riak_core_ring:set_owner(Ring2, node()),
146147
Ring4 =
147-
riak_core_ring:update_member_meta(node(),
148-
Ring3,
149-
node(),
150-
gossip_vsn,
151-
GossipVsn),
152-
ParticipateInCoverage = app_helper:get_env(riak_core,participate_in_coverage),
148+
riak_core_ring:update_member_meta(
149+
node(), Ring3, node(), gossip_vsn, GossipVsn),
150+
ParticipateInCoverage =
151+
app_helper:get_env(riak_core,participate_in_coverage),
153152
Ring4a =
154-
riak_core_ring:update_member_meta(node(),
155-
Ring4,
156-
node(),
157-
participate_in_coverage, ParticipateInCoverage),
153+
riak_core_ring:update_member_meta(
154+
node(),
155+
Ring4,
156+
node(),
157+
participate_in_coverage,
158+
ParticipateInCoverage),
158159
{_, Ring5} = riak_core_capability:update_ring(Ring4a),
159160
Ring6 = maybe_auto_join(Auto, node(), Ring5),
160161
riak_core_ring_manager:set_my_ring(Ring6),
161-
riak_core_gossip:send_ring(Node, node())
162+
ok = riak_core_gossip:send_ring(Node, node()),
163+
ok = riak_core_metadata_manager:attempt_exchange(Node)
164+
162165
end.
163166

164167
maybe_auto_join(false, _Node, Ring) ->

src/riak_core_handoff_receiver.erl

Lines changed: 111 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,24 @@
2828
[{gen_fsm, sync_send_all_state_event, 3}]}).
2929

3030
-export([start_link/0, % Don't use SSL
31-
start_link/1, % SSL options list, empty=no SSL
32-
set_socket/2,
33-
supports_batching/0]).
31+
start_link/1, % SSL options list, empty=no SSL
32+
set_socket/2,
33+
supports_batching/0,
34+
get_handoff_timeout/0]).
3435
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
3536
terminate/2, code_change/3]).
3637

37-
-record(state, {sock :: port() | undefined,
38-
peer :: term() | undefined,
39-
ssl_opts :: [] | list(),
40-
tcp_mod :: atom(),
41-
recv_timeout_len :: non_neg_integer(),
42-
vnode_timeout_len :: non_neg_integer(),
43-
partition :: non_neg_integer() | undefined,
44-
vnode_mod = riak_kv_vnode :: module(),
45-
vnode :: pid() | undefined,
46-
count = 0 :: non_neg_integer()}).
38+
-record(state,
39+
{sock :: port() | undefined,
40+
peer :: term() | undefined,
41+
ssl_opts :: [] | list(),
42+
tcp_mod :: atom(),
43+
recv_timeout_len :: non_neg_integer(),
44+
vnode_timeout_len :: non_neg_integer(),
45+
partition :: non_neg_integer() | undefined,
46+
vnode_mod = riak_kv_vnode :: module(),
47+
vnode :: pid() | undefined,
48+
count = 0 :: non_neg_integer()}).
4749

4850
%% set the TCP receive timeout to five minutes to be conservative.
4951
-define(RECV_TIMEOUT, 300000).
@@ -71,48 +73,71 @@ supports_batching() ->
7173
true.
7274

7375
init([SslOpts]) ->
74-
{ok, #state{ssl_opts = SslOpts,
75-
tcp_mod = if SslOpts /= [] -> ssl;
76-
true -> gen_tcp
77-
end,
78-
recv_timeout_len = app_helper:get_env(riak_core, handoff_receive_timeout, ?RECV_TIMEOUT),
79-
vnode_timeout_len = app_helper:get_env(riak_core, handoff_receive_vnode_timeout, ?VNODE_TIMEOUT)}}.
80-
81-
handle_call({set_socket, Socket0}, _From, State = #state{ssl_opts = SslOpts}) ->
76+
{ok,
77+
#state{
78+
ssl_opts = SslOpts,
79+
tcp_mod =
80+
if SslOpts /= [] ->
81+
ssl;
82+
true ->
83+
gen_tcp
84+
end,
85+
recv_timeout_len = get_handoff_timeout(),
86+
vnode_timeout_len =
87+
app_helper:get_env(
88+
riak_core, handoff_receive_vnode_timeout, ?VNODE_TIMEOUT)
89+
}
90+
}.
91+
92+
handle_call({set_socket, Socket0}, _From, State=#state{ssl_opts = SslOpts}) ->
8293
SockOpts = [{active, once}, {packet, 4}, {header, 1}],
83-
Socket = if SslOpts /= [] ->
84-
{ok, Skt} = ssl_handshake(Socket0, SslOpts, 30*1000),
85-
ok = ssl:setopts(Skt, SockOpts),
86-
Peer = safe_peername(Skt, ssl),
87-
Skt;
88-
true ->
89-
ok = inet:setopts(Socket0, SockOpts),
90-
Peer = safe_peername(Socket0, inet),
91-
Socket0
92-
end,
94+
Socket =
95+
if SslOpts /= [] ->
96+
{ok, Skt} = ssl_handshake(Socket0, SslOpts, 30*1000),
97+
ok = ssl:setopts(Skt, SockOpts),
98+
Peer = safe_peername(Skt, ssl),
99+
Skt;
100+
true ->
101+
ok = inet:setopts(Socket0, SockOpts),
102+
Peer = safe_peername(Socket0, inet),
103+
Socket0
104+
end,
93105
{reply, ok, State#state { sock = Socket, peer = Peer }}.
94106

95-
handle_info({tcp_closed,_Socket},State=#state{partition=Partition,count=Count,
96-
peer=Peer}) ->
97-
lager:info("Handoff receiver for partition ~p exited after processing ~p"
98-
" objects from ~p", [Partition, Count, Peer]),
107+
handle_info(
108+
{tcp_closed,_Socket},
109+
State=#state{partition=Partition, count=Count, peer=Peer}) ->
110+
lager:info(
111+
"Handoff receiver for partition ~p exited after processing ~p"
112+
" objects from ~p",
113+
[Partition, Count, Peer]),
99114
{stop, normal, State};
100-
handle_info({tcp_error, _Socket, Reason}, State=#state{partition=Partition,count=Count,
101-
peer=Peer}) ->
102-
lager:info("Handoff receiver for partition ~p exited after processing ~p"
103-
" objects from ~p: TCP error ~p", [Partition, Count, Peer, Reason]),
115+
handle_info(
116+
{tcp_error, _Socket, Reason},
117+
State=#state{partition=Partition,count=Count, peer=Peer}) ->
118+
lager:info(
119+
"Handoff receiver for partition ~p exited after processing ~p"
120+
" objects from ~p: TCP error ~p",
121+
[Partition, Count, Peer, Reason]),
104122
{stop, normal, State};
105123
handle_info({tcp, Socket, Data}, State) ->
106124
[MsgType|MsgData] = Data,
107125
case catch(process_message(MsgType, MsgData, State)) of
108126
{'EXIT', Reason} ->
109-
lager:error("Handoff receiver for partition ~p exited abnormally after "
110-
"processing ~p objects from ~p: ~p", [State#state.partition, State#state.count, State#state.peer, Reason]),
127+
lager:error(
128+
"Handoff receiver for partition ~p exited abnormally after "
129+
"processing ~p objects from ~p: ~p",
130+
[State#state.partition,
131+
State#state.count,
132+
State#state.peer, Reason]),
111133
{stop, normal, State};
112134
NewState when is_record(NewState, state) ->
113-
InetMod = if NewState#state.ssl_opts /= [] -> ssl;
114-
true -> inet
115-
end,
135+
InetMod =
136+
if NewState#state.ssl_opts /= [] ->
137+
ssl;
138+
true ->
139+
inet
140+
end,
116141
InetMod:setopts(Socket, [{active, once}]),
117142
{noreply, NewState, State#state.recv_timeout_len}
118143
end;
@@ -123,27 +148,38 @@ handle_info({ssl_error, Socket, Reason}, State) ->
123148
handle_info({ssl, Socket, Data}, State) ->
124149
handle_info({tcp, Socket, Data}, State);
125150
handle_info(timeout, State) ->
126-
lager:error("Handoff receiver for partition ~p timed out after "
127-
"processing ~p objects from ~p.", [State#state.partition, State#state.count, State#state.peer]),
151+
lager:error(
152+
"Handoff receiver for partition ~p timed out after "
153+
"processing ~p objects from ~p.",
154+
[State#state.partition, State#state.count, State#state.peer]),
128155
{stop, normal, State}.
129156

130-
process_message(?PT_MSG_INIT, MsgData, State=#state{vnode_mod=VNodeMod,
131-
peer=Peer}) ->
157+
process_message(
158+
?PT_MSG_INIT,
159+
MsgData,
160+
State=#state{vnode_mod=VNodeMod, peer=Peer}) ->
132161
<<Partition:160/integer>> = MsgData,
133-
lager:info("Receiving handoff data for partition ~p:~p from ~p", [VNodeMod, Partition, Peer]),
162+
lager:info(
163+
"Receiving handoff data for partition ~p:~p from ~p",
164+
[VNodeMod, Partition, Peer]),
134165
{ok, VNode} = riak_core_vnode_master:get_vnode_pid(Partition, VNodeMod),
135-
Data = [{mod_src_tgt, {VNodeMod, undefined, Partition}},
136-
{vnode_pid, VNode}],
166+
Data =
167+
[{mod_src_tgt, {VNodeMod, undefined, Partition}}, {vnode_pid, VNode}],
137168
riak_core_handoff_manager:set_recv_data(self(), Data),
138169
State#state{partition=Partition, vnode=VNode};
139-
140170
process_message(?PT_MSG_BATCH, MsgData, State) ->
141-
lists:foldl(fun(Obj, StateAcc) -> process_message(?PT_MSG_OBJ, Obj, StateAcc) end,
142-
State,
143-
binary_to_term(MsgData));
144-
145-
process_message(?PT_MSG_OBJ, MsgData, State=#state{vnode=VNode, count=Count,
146-
vnode_timeout_len=VNodeTimeout}) ->
171+
lists:foldl(
172+
fun(Obj, StateAcc) ->
173+
process_message(?PT_MSG_OBJ, Obj, StateAcc)
174+
end,
175+
State,
176+
binary_to_term(MsgData));
177+
process_message(
178+
?PT_MSG_OBJ,
179+
MsgData,
180+
State =
181+
#state{
182+
vnode=VNode, count=Count, vnode_timeout_len=VNodeTimeout}) ->
147183
Msg = {handoff_data, MsgData},
148184
try gen_fsm:sync_send_all_state_event(VNode, Msg, VNodeTimeout) of
149185
ok ->
@@ -152,23 +188,28 @@ process_message(?PT_MSG_OBJ, MsgData, State=#state{vnode=VNode, count=Count,
152188
exit(E)
153189
catch
154190
exit:{timeout, _} ->
155-
exit({error, {vnode_timeout, VNodeTimeout, size(MsgData),
156-
binary:part(MsgData, {0,min(size(MsgData),128)})}})
191+
exit({error, {vnode_timeout, VNodeTimeout, size(MsgData)}})
157192
end;
158-
process_message(?PT_MSG_OLDSYNC, MsgData, State=#state{sock=Socket,
159-
tcp_mod=TcpMod}) ->
193+
process_message(
194+
?PT_MSG_OLDSYNC, MsgData, State=#state{sock=Socket, tcp_mod=TcpMod}) ->
195+
% Message still required for now, as when upgrading, may have a sender in
196+
% the cluster which has not upgraded ... and so will still send OLDSYNC
160197
TcpMod:send(Socket, <<?PT_MSG_OLDSYNC:8,"sync">>),
161198
<<VNodeModBin/binary>> = MsgData,
162199
VNodeMod = binary_to_atom(VNodeModBin, utf8),
163200
State#state{vnode_mod=VNodeMod};
164-
process_message(?PT_MSG_SYNC, _MsgData, State=#state{sock=Socket,
165-
tcp_mod=TcpMod}) ->
201+
process_message(
202+
?PT_MSG_SYNC,
203+
_MsgData,
204+
State=#state{sock=Socket, tcp_mod=TcpMod}) ->
166205
TcpMod:send(Socket, <<?PT_MSG_SYNC:8, "sync">>),
167206
State;
168207
process_message(?PT_MSG_CONFIGURE, MsgData, State) ->
169208
ConfProps = binary_to_term(MsgData),
170-
State#state{vnode_mod=proplists:get_value(vnode_mod, ConfProps),
171-
partition=proplists:get_value(partition, ConfProps)};
209+
% Partition used will be over-written by ?PT_MSG_INIT
210+
State#state{
211+
vnode_mod=proplists:get_value(vnode_mod, ConfProps),
212+
partition=proplists:get_value(partition, ConfProps)};
172213
process_message(_, _MsgData, State=#state{sock=Socket,
173214
tcp_mod=TcpMod}) ->
174215
TcpMod:send(Socket, <<255:8,"unknown_msg">>),
@@ -185,5 +226,8 @@ safe_peername(Skt, Mod) ->
185226
{ok, {Host, Port}} ->
186227
{inet_parse:ntoa(Host), Port};
187228
_ ->
188-
{unknown, unknown} % Real info is {Addr, Port}
229+
{unknown, unknown} % Real info is {Addr, Port}
189230
end.
231+
232+
get_handoff_timeout() ->
233+
app_helper:get_env(riak_core, handoff_receive_timeout, ?RECV_TIMEOUT).

0 commit comments

Comments
 (0)