Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/hex_core.erl
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@
%%
%% * `docs_tarball_max_uncompressed_size' - Maximum size of uncompressed docs tarball, defaults to
%% `134_217_728' (128 MiB). Set to `infinity' to not enforce the limit.
%%
%% * `metadata_fields' - Either `all' or a list of metadata.config keys (binaries) to read.
%% When set to a list, the metadata decoder streams past unrequested fields without
%% buffering their tokens, which keeps peak memory bounded for packages with very
%% large fields like `<<"files">>'. Defaults to `all'.

-module(hex_core).
-export([default_config/0]).
Expand Down Expand Up @@ -111,7 +116,8 @@
tarball_max_size => pos_integer() | infinity,
tarball_max_uncompressed_size => pos_integer() | infinity,
docs_tarball_max_size => pos_integer() | infinity,
docs_tarball_max_uncompressed_size => pos_integer() | infinity
docs_tarball_max_uncompressed_size => pos_integer() | infinity,
metadata_fields => all | [binary()]
}.

-spec default_config() -> config().
Expand All @@ -137,5 +143,6 @@ default_config() ->
tarball_max_size => 16 * 1024 * 1024,
tarball_max_uncompressed_size => 128 * 1024 * 1024,
docs_tarball_max_size => 16 * 1024 * 1024,
docs_tarball_max_uncompressed_size => 128 * 1024 * 1024
docs_tarball_max_uncompressed_size => 128 * 1024 * 1024,
metadata_fields => all
}.
302 changes: 277 additions & 25 deletions src/hex_tarball.erl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
format_error/1
]).
-ifdef(TEST).
-export([do_decode_metadata/1, do_decode_metadata/2, gzip/1, normalize_requirements/1]).
-endif.
-define(VERSION, <<"3">>).
-define(HASH_CHUNK_SIZE, 65536).
-define(MAX_VERSION_SIZE, 32).
-define(MAX_CHECKSUM_SIZE, 128).
%% NOTE(review): raised from 128 KiB in this change — presumably safe now that
%% metadata decoding streams instead of buffering whole terms; confirm.
-define(MAX_METADATA_SIZE, 1024 * 1024).
%% Bytes of metadata.config converted to a char list per tokenizer step.
-define(METADATA_CHUNK_SIZE, 4096).
-define(BUILD_TOOL_FILES, [
{<<"mix.exs">>, <<"mix">>},
{<<"rebar.config">>, <<"rebar3">>},
Expand Down Expand Up @@ -546,41 +547,292 @@ check_inner_checksum(#{files := Files} = State) ->
%% @private
%% Decodes the metadata.config entry of an already-extracted tarball state.
%% Reads `metadata_fields' from the tarball config (defaulting to `all') so
%% large unrequested fields can be streamed past instead of fully decoded.
%% Errors from earlier pipeline stages pass through untouched.
decode_metadata({error, _} = Error) ->
    Error;
decode_metadata(#{files := #{"metadata.config" := Binary}, config := Config} = State) when
    is_binary(Binary)
->
    Fields = maps:get(metadata_fields, Config, all),
    case do_decode_metadata(Binary, Fields) of
        #{} = Metadata -> maps:put(metadata, normalize_metadata(Metadata), State);
        Other -> Other
    end.

-ifdef(TEST).
%% @private
%% Arity-1 compatibility wrapper kept for tests: decode every field.
do_decode_metadata(Binary) ->
    do_decode_metadata(Binary, all).
-endif.

%% @private
%% Entry point for metadata decoding. `all' takes the chunked path that keeps
%% every form; an explicit field list takes the streaming path that skips
%% unrequested forms token by token. Both paths first try UTF-8 and restart
%% from scratch in latin1 mode when the bytes cannot be valid UTF-8.
do_decode_metadata(Binary, all) when is_binary(Binary) ->
    case decode_metadata_chunked(utf8, Binary, <<>>, [], "", []) of
        latin1_fallback ->
            decode_metadata_chunked(latin1, Binary, <<>>, [], "", []);
        Other ->
            Other
    end;
do_decode_metadata(Binary, Fields) when is_binary(Binary), is_list(Fields) ->
    case decode_metadata_streaming(utf8, Binary, <<>>, [], "", [], Fields, start) of
        latin1_fallback ->
            decode_metadata_streaming(latin1, Binary, <<>>, [], "", [], Fields, start);
        Other ->
            Other
    end.

%% @private
%% Streams the metadata.config binary through safe_erl_term:tokens/2 in
%% small chunks so the whole binary is never materialized as a char list.
%% Each accepted dot-terminated form is parsed and accumulated immediately,
%% keeping peak memory at roughly one chunk + one term's tokens + AST.
decode_metadata_chunked(_Encoding, <<>>, <<>>, Cont, [], Acc) ->
    %% Every byte consumed: hand any pending continuation to the EOF path.
    flush_metadata_eof(Cont, Acc);
decode_metadata_chunked(utf8, <<>>, _IncTail, _Cont, [], _Acc) ->
    %% Trailing bytes that can never form a complete UTF-8 sequence —
    %% restart the whole decode in latin1 mode rather than spin.
    latin1_fallback;
decode_metadata_chunked(Encoding, Binary, IncTail, Cont, [], Acc) ->
    %% Out of characters but bytes remain: decode the next chunk.
    case decode_metadata_chunk(Encoding, Binary, IncTail) of
        {ok, MoreChars, RestBinary, RestTail} ->
            feed_metadata(Encoding, Cont, MoreChars, RestBinary, RestTail, Acc);
        latin1_fallback ->
            latin1_fallback
    end;
decode_metadata_chunked(Encoding, Binary, IncTail, Cont, Chars, Acc) ->
    %% Characters left over from the previous tokenization step.
    feed_metadata(Encoding, Cont, Chars, Binary, IncTail, Acc).

%% @private
%% Runs the tokenizer over the current characters. Each completed
%% dot-terminated form is parsed into a term and accumulated before the
%% stream continues, so only one form's tokens are live at a time.
feed_metadata(Encoding, Cont, Chars, Binary, IncTail, Acc) ->
    case safe_erl_term:tokens(Cont, Chars) of
        {more, NewCont} ->
            %% Need more characters to finish the current form.
            decode_metadata_chunked(Encoding, Binary, IncTail, NewCont, "", Acc);
        {done, {ok, Tokens, _}, RestChars} ->
            case parse_metadata_term(Tokens) of
                {ok, Term} ->
                    decode_metadata_chunked(
                        Encoding, Binary, IncTail, [], normalize_rest_chars(RestChars), [Term | Acc]
                    );
                {error, _} = Err ->
                    Err
            end;
        {done, {eof, _}, _} ->
            finalize_metadata(Acc);
        {done, {error, {_, safe_erl_term, Reason}, _}, _} ->
            {error, {metadata, Reason}}
    end.

%% @private
%% Drives any pending tokenizer continuation to EOF once the input bytes
%% are exhausted. A continuation that still yields a complete form means
%% the input ended mid-stream, which is rejected as invalid.
flush_metadata_eof([], Acc) ->
    finalize_metadata(Acc);
flush_metadata_eof(Cont, Acc) ->
    Result = safe_erl_term:tokens(Cont, eof),
    case Result of
        {done, {error, {_, safe_erl_term, Reason}, _}, _} ->
            {error, {metadata, Reason}};
        {done, {eof, _}, _} ->
            finalize_metadata(Acc);
        {done, {ok, _Partial, _}, _} ->
            {error, {metadata, invalid_terms}}
    end.

%% @private
%% Turns the reversed accumulator of parsed forms into the metadata map.
%% An empty accumulator means no forms were decoded at all, which is an
%% error; non-pair elements make maps:from_list/1 raise badarg, which is
%% reported as not_key_value.
finalize_metadata([]) ->
    {error, {metadata, invalid_terms}};
finalize_metadata(Acc) ->
    try maps:from_list(lists:reverse(Acc)) of
        Map -> Map
    catch
        error:badarg -> {error, {metadata, not_key_value}}
    end.

%% @private
%% Parses one dot-terminated token sequence into an Erlang term, mapping
%% any parse failure onto the metadata-level invalid_terms error.
parse_metadata_term(Tokens) ->
    case erl_parse:parse_term(Tokens) of
        {ok, _} = Ok -> Ok;
        {error, _} -> {error, {metadata, invalid_terms}}
    end.

%% @private
%% Converts up to one chunk of the remaining binary into a character list.
%% For UTF-8, an incomplete trailing sequence is carried over as the
%% returned tail and prepended to the next chunk; bytes that cannot be
%% valid UTF-8 signal latin1_fallback so the caller restarts the whole
%% decode in latin1 mode. Latin1 decoding never fails and carries no tail.
decode_metadata_chunk(utf8, Binary, IncTail) ->
    {Chunk, Rest} = take_metadata_chunk(Binary),
    Combined =
        case IncTail of
            <<>> -> Chunk;
            _ -> <<IncTail/binary, Chunk/binary>>
        end,
    case unicode:characters_to_list(Combined, utf8) of
        L when is_list(L) ->
            {ok, L, Rest, <<>>};
        {incomplete, L, NewTail} ->
            {ok, L, Rest, NewTail};
        {error, _, _} ->
            latin1_fallback
    end;
decode_metadata_chunk(latin1, Binary, _IncTail) ->
    {Chunk, Rest} = take_metadata_chunk(Binary),
    {ok, binary_to_list(Chunk), Rest, <<>>}.

%% @private
%% Splits off at most one ?METADATA_CHUNK_SIZE-byte slice of the binary,
%% returning {Chunk, Remaining}; a short final binary is returned whole.
take_metadata_chunk(<<Chunk:(?METADATA_CHUNK_SIZE)/binary, Rest/binary>>) ->
    {Chunk, Rest};
take_metadata_chunk(Shorter) ->
    {Shorter, <<>>}.

%% @private
%% The tokenizer hands back either leftover characters or the atom `eof';
%% collapse both shapes into a (possibly empty) character list.
normalize_rest_chars(Rest) when is_list(Rest) -> Rest;
normalize_rest_chars(eof) -> [].

%% @private
%% Streams the metadata.config binary through safe_erl_term:token/2 one
%% token at a time. Forms whose key is in Fields are accumulated and
%% parsed; forms whose key is not in Fields are discarded with only a
%% depth counter held in state, so peak memory stays bounded regardless
%% of the unwanted form's size.
decode_metadata_streaming(_Encoding, <<>>, <<>>, Cont, [], Acc, Fields, State) ->
    %% All input consumed: flush the continuation at EOF.
    flush_metadata_streaming_eof(Cont, Acc, Fields, State);
decode_metadata_streaming(utf8, <<>>, _IncTail, _Cont, [], _Acc, _Fields, _State) ->
    %% Leftover bytes that can never complete a UTF-8 sequence: retry latin1.
    latin1_fallback;
decode_metadata_streaming(Encoding, Binary, IncTail, Cont, [], Acc, Fields, State) ->
    %% No characters buffered: decode the next chunk of bytes.
    case decode_metadata_chunk(Encoding, Binary, IncTail) of
        {ok, MoreChars, RestBinary, RestTail} ->
            feed_metadata_streaming(
                Encoding, Cont, MoreChars, RestBinary, RestTail, Acc, Fields, State
            );
        latin1_fallback ->
            latin1_fallback
    end;
decode_metadata_streaming(Encoding, Binary, IncTail, Cont, Chars, Acc, Fields, State) ->
    %% Characters remain from the previous step; keep tokenizing them.
    feed_metadata_streaming(Encoding, Cont, Chars, Binary, IncTail, Acc, Fields, State).

%% @private
%% Runs one tokenizer step and threads the resulting token through the
%% schema state machine before continuing the stream.
feed_metadata_streaming(Encoding, Cont, Chars, Binary, IncTail, Acc, Fields, State) ->
    case safe_erl_term:token(Cont, Chars) of
        {more, NextCont} ->
            decode_metadata_streaming(Encoding, Binary, IncTail, NextCont, "", Acc, Fields, State);
        {done, {ok, Token, _}, Leftover} ->
            handle_streaming_token(Encoding, Binary, IncTail, Leftover, Acc, Fields, State, Token);
        {done, {eof, _}, _} ->
            finalize_metadata_streaming(Acc, State);
        {done, {error, {_, safe_erl_term, Reason}, _}, _} ->
            {error, {metadata, Reason}}
    end.

%% @private
%% Advances the state machine for one accepted token; errors short-circuit,
%% otherwise the stream resumes with the leftover characters.
handle_streaming_token(Encoding, Binary, IncTail, Leftover, Acc, Fields, State, Token) ->
    case advance_metadata_state(State, Acc, Fields, Token) of
        {next, NextState, NextAcc} ->
            decode_metadata_streaming(
                Encoding,
                Binary,
                IncTail,
                [],
                normalize_rest_chars(Leftover),
                NextAcc,
                Fields,
                NextState
            );
        {error, _} = Err ->
            Err
    end.

%% @private
%% Feeds `eof' into any pending tokenizer continuation so the final token,
%% if any, flows through the state machine before the result is built.
flush_metadata_streaming_eof([], Acc, _Fields, State) ->
    finalize_metadata_streaming(Acc, State);
flush_metadata_streaming_eof(Cont, Acc, Fields, State) ->
    case safe_erl_term:token(Cont, eof) of
        {done, {eof, _}, _} ->
            finalize_metadata_streaming(Acc, State);
        {done, {error, {_, safe_erl_term, Reason}, _}, _} ->
            {error, {metadata, Reason}};
        {done, {ok, Token, _}, _} ->
            case advance_metadata_state(State, Acc, Fields, Token) of
                {next, NextState, NextAcc} ->
                    %% Recurse with an empty continuation to finalize.
                    flush_metadata_streaming_eof([], NextAcc, Fields, NextState);
                {error, _} = Err ->
                    Err
            end
    end.

%% @private
%% Maps the terminal stream state to a result. `start' and `between' are
%% the only legal positions at which input may end; an empty accumulator
%% in `between' means every form was skipped, which yields an empty map,
%% while ending mid-form is invalid.
finalize_metadata_streaming([], between) ->
    #{};
finalize_metadata_streaming(Acc, start) ->
    finalize_metadata(Acc);
finalize_metadata_streaming(Acc, between) ->
    finalize_metadata(Acc);
finalize_metadata_streaming(_Acc, _MidForm) ->
    {error, {metadata, invalid_terms}}.

%% @private
%% State machine for streaming the metadata.config schema. Forms are required
%% to be `{<<"key">>, value}.` — anything else is rejected as invalid.
%%
%% States: start | between | {after_open, Prefix} | {after_left_binary, Prefix}
%% | {after_key, KeyChars, Prefix} | {after_right_binary, KeyChars, Prefix}
%% | {accumulate, Prefix, Depth} | {skip, Depth}
%%
%% `start` is the initial position; `between` is the position after a form has
%% been completed. Distinguishing them lets empty input return the same
%% invalid_terms error as the non-streaming path while a stream that
%% successfully skipped every form returns an empty map.
%%
%% Prefix holds the tokens seen so far for the current form, in reverse
%% order. Depth counts currently-open `{`/`[` brackets of the form
%% (including the form's own outer tuple), so the terminating dot is only
%% legal once Depth has returned to 0.
%%
%% --- Form prefix: `{`, `<<`, "key", `>>` in that exact order. ---
advance_metadata_state(Open, Acc, _Fields, {'{', _} = T) when Open =:= start; Open =:= between ->
    {next, {after_open, [T]}, Acc};
advance_metadata_state({after_open, Prefix}, Acc, _Fields, {'<<', _} = T) ->
    {next, {after_left_binary, [T | Prefix]}, Acc};
advance_metadata_state({after_left_binary, Prefix}, Acc, _Fields, {string, _, KeyChars} = T) ->
    {next, {after_key, KeyChars, [T | Prefix]}, Acc};
advance_metadata_state({after_key, KeyChars, Prefix}, Acc, _Fields, {'>>', _} = T) ->
    {next, {after_right_binary, KeyChars, [T | Prefix]}, Acc};
%% The comma after the key decides the form's fate: requested keys switch
%% to token accumulation, unrequested keys to depth-only skipping. Depth
%% starts at 1 for the form's outer tuple, already open at this point.
advance_metadata_state({after_right_binary, KeyChars, Prefix}, Acc, Fields, {',', _} = T) ->
    case extract_metadata_key(KeyChars) of
        {ok, Key} ->
            case lists:member(Key, Fields) of
                true -> {next, {accumulate, [T | Prefix], 1}, Acc};
                false -> {next, {skip, 1}, Acc}
            end;
        error ->
            {error, {metadata, not_key_value}}
    end;
%% --- Accumulating a requested form: keep every token, track bracket depth. ---
%% A dot at depth 0 completes the form; the reversed prefix is parsed into a
%% term and stored. A dot at any other depth means unbalanced brackets.
advance_metadata_state({accumulate, Prefix, 0}, Acc, _Fields, {dot, _} = T) ->
    Tokens = lists:reverse([T | Prefix]),
    case parse_metadata_term(Tokens) of
        {ok, Term} -> {next, between, [Term | Acc]};
        {error, _} = Err -> Err
    end;
advance_metadata_state({accumulate, _, _}, _Acc, _Fields, {dot, _}) ->
    {error, {metadata, invalid_terms}};
advance_metadata_state({accumulate, Prefix, Depth}, Acc, _Fields, {Open, _} = T) when
    Open =:= '{'; Open =:= '['
->
    {next, {accumulate, [T | Prefix], Depth + 1}, Acc};
advance_metadata_state({accumulate, Prefix, Depth}, Acc, _Fields, {Close, _} = T) when
    Close =:= '}'; Close =:= ']'
->
    {next, {accumulate, [T | Prefix], Depth - 1}, Acc};
advance_metadata_state({accumulate, Prefix, Depth}, Acc, _Fields, T) ->
    {next, {accumulate, [T | Prefix], Depth}, Acc};
%% --- Skipping an unrequested form: same depth tracking, tokens discarded. ---
advance_metadata_state({skip, 0}, Acc, _Fields, {dot, _}) ->
    {next, between, Acc};
advance_metadata_state({skip, _}, _Acc, _Fields, {dot, _}) ->
    {error, {metadata, invalid_terms}};
advance_metadata_state({skip, Depth}, Acc, _Fields, {Open, _}) when
    Open =:= '{'; Open =:= '['
->
    {next, {skip, Depth + 1}, Acc};
advance_metadata_state({skip, Depth}, Acc, _Fields, {Close, _}) when
    Close =:= '}'; Close =:= ']'
->
    {next, {skip, Depth - 1}, Acc};
advance_metadata_state({skip, Depth}, Acc, _Fields, _Token) ->
    {next, {skip, Depth}, Acc};
%% Any token that does not fit the `{<<"key">>, value}.` shape at the current
%% position rejects the whole metadata file.
advance_metadata_state(_State, _Acc, _Fields, _Token) ->
    {error, {metadata, not_key_value}}.

%% @private
%% Converts the raw character list from a string token into a binary key.
%% Characters outside the byte range make list_to_binary/1 raise badarg,
%% in which case the key is unrepresentable and `error' is returned.
extract_metadata_key(KeyChars) ->
    try
        {ok, list_to_binary(KeyChars)}
    catch
        error:badarg ->
            error
    end.

%% @private
Expand Down
4 changes: 3 additions & 1 deletion src/safe_erl_term.xrl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ Rules.
{D}+ : {token, {integer, TokenLine, list_to_integer(TokenChars)}}.
[\#\[\]}{,+-] : {token, {list_to_atom(TokenChars), TokenLine}}.
(<<|>>|=>) : {token, {list_to_atom(TokenChars), TokenLine}}.
% end_token (not token) lets hex_tarball stream-decode metadata.config
% one form at a time via safe_erl_term:tokens/2.
\. : {end_token, {dot, TokenLine}}.
/ : {token, {'/', TokenLine}}.
{WS}+ : skip_token.

Expand Down
Loading
Loading