Skip to content

Commit e12b6b7

Browse files
committed
Merge pull request #339 from gitoleg/bap-bytes-PR
Bap_bytes module
2 parents 9b55e97 + ccc7729 commit e12b6b7

13 files changed

+555
-0
lines changed

_oasis

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Library types
103103
Bap_bil_adt,
104104
Bap_biri,
105105
Bap_bitvector,
106+
Bap_bytes,
106107
Bap_common,
107108
Bap_config,
108109
Bap_context,
@@ -324,6 +325,7 @@ Library types_test
324325
BuildDepends: bap, oUnit
325326
Modules: Test_bitvector,
326327
Test_bili,
328+
Test_bytes,
327329
Test_graph,
328330
Test_trie
329331

lib/bap/bap.mli

Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,309 @@ module Std : sig
608608
val pkg_version : string
609609
end
610610

611+
module Bytes : sig
612+
613+
type t = Bytes.t with bin_io, compare, sexp
614+
615+
include Container.S0 with type t := t with type elt := char
616+
include Blit.S with type t := t
617+
include Identifiable.S with type t := t
618+
module To_string : Blit.S_distinct with type src := t with type dst := string
619+
module From_string : Blit.S_distinct with type src := string with type dst := t
620+
621+
(** [create n] returns a new byte sequence of length [n]. The
622+
sequence is uninitialized and contains arbitrary bytes.
623+
Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
624+
external create : int -> t = "caml_create_string"
625+
626+
(** [make n c] returns a new byte sequence of length [n], filled with
627+
the byte [c].
628+
Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
629+
val make : int -> char -> t
630+
631+
(** [init n ~f] returns a fresh byte sequence of length [n], with
632+
character [i] initialized to the result of [f i] (in increasing
633+
index order).
634+
Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
635+
val init : int -> f:(int -> char) -> t
636+
637+
(** [empty] is a byte sequence of size 0. *)
638+
val empty : t
639+
640+
(** [length t] returns the length (number of bytes) of [t]. *)
641+
external length: t -> int = "%string_length"
642+
643+
(** [get s n] returns the byte at index [n] in [s].
644+
Raise [Invalid_argument] if [n] is not a valid index in [s]. *)
645+
external get : t -> int -> char = "%string_safe_get"
646+
647+
(** [set s n c] modifies [s] in place, replacing the byte at index [n]
648+
with [c].
649+
Raise [Invalid_argument] if [n] is not a valid index in [s]. *)
650+
external set : t -> int -> char -> unit = "%string_safe_set"
651+
652+
(** [copy t] returns a new byte sequence that contains the same
653+
bytes as [t]. *)
654+
val copy : t -> t
655+
656+
(** [of_string s] returns a new byte sequence that contains the
657+
same bytes as the given string. *)
658+
val of_string : string -> t
659+
660+
(** [to_string t] returns a new string that contains the same
661+
bytes as the given byte sequence. *)
662+
val to_string : t -> string
663+
664+
(** [extend s left right] returns a new byte sequence that contains
665+
the bytes of [s], with [left] uninitialized bytes prepended and
666+
[right] uninitialized bytes appended to it. If [left] or [right]
667+
is negative, then bytes are removed (instead of appended) from
668+
the corresponding side of [s].
669+
Raise [Invalid_argument] if the result length is negative or
670+
longer than {!Sys.max_string_length} bytes. *)
671+
val extend : t -> int -> int -> t
672+
673+
(** [fill s start len c] modifies [s] in place, replacing [len]
674+
characters with [c], starting at [start].
675+
Raise [Invalid_argument] if [start] and [len] do not designate a
676+
valid range of [s]. *)
677+
val fill : t -> int -> int -> char -> unit
678+
679+
(** [concat sep sl] concatenates the list of byte sequences [sl],
680+
inserting the separator byte sequence [sep] between each, and
681+
returns the result as a new byte sequence.
682+
Raise [Invalid_argument] if the result is longer than
683+
{!Sys.max_string_length} bytes. *)
684+
val concat : t -> t list -> t
685+
686+
(** [cat s1 s2] concatenates [s1] and [s2] and returns the result
687+
as a new byte sequence.
688+
Raise [Invalid_argument] if the result is longer than
689+
{!Sys.max_string_length} bytes. *)
690+
val cat : t -> t -> t
691+
692+
(** [iteri t ~f] is the same as {!iter}, but the function is
693+
applied to the index of the byte as first argument and the
694+
byte itself as second argument. *)
695+
val iteri : t -> f:(int -> char -> unit) -> unit
696+
697+
(** [map s ~f] applies function [f] in turn to all the bytes of [s]
698+
(in increasing index order) and stores the resulting bytes in
699+
a new sequence that is returned as the result. *)
700+
val map : t -> f:(char -> char) -> t
701+
702+
(** [mapi s ~f] calls [f] with each character of [s] and its
703+
index (in increasing index order) and stores the resulting bytes
704+
in a new sequence that is returned as the result. *)
705+
val mapi : t -> f:(int -> char -> char) -> t
706+
707+
(** [trim t] returns a copy of [t], without leading and trailing
708+
whitespace. The bytes regarded as whitespace are the ASCII
709+
characters [' '], ['\012'], ['\n'], ['\r'], and ['\t']. *)
710+
val trim : t -> t
711+
712+
(** [escaped t] returns a copy of [t], with special characters
713+
represented by escape sequences, following the lexical
714+
conventions of OCaml.
715+
Raise [Invalid_argument] if the result is longer than
716+
{!Sys.max_string_length} bytes. *)
717+
val escaped : t -> t
718+
719+
(** [index s c] returns the index of the first occurrence of byte [c]
720+
in [s].
721+
Raise [Not_found] if [c] does not occur in [s]. *)
722+
val index : t -> char -> int
723+
724+
(** [rindex s c] returns the index of the last occurrence of byte [c]
725+
in [s].
726+
Raise [Not_found] if [c] does not occur in [s]. *)
727+
val rindex : t -> char -> int
728+
729+
(** [index_from s i c] returns the index of the first occurrence of
730+
byte [c] in [s] after position [i]. [index s c] is
731+
equivalent to [index_from s 0 c].
732+
Raise [Invalid_argument] if [i] is not a valid position in [s].
733+
Raise [Not_found] if [c] does not occur in [s] after position [i]. *)
734+
val index_from : t -> int -> char -> int
735+
736+
(** [rindex_from s i c] returns the index of the last occurrence of
737+
byte [c] in [s] before position [i+1]. [rindex s c] is equivalent
738+
to [rindex_from s (length s - 1) c].
739+
Raise [Invalid_argument] if [i+1] is not a valid position in [s].
740+
Raise [Not_found] if [c] does not occur in [s] before position [i+1]. *)
741+
val rindex_from : t -> int -> char -> int
742+
743+
(** [contains s c] tests if byte [c] appears in [s]. *)
744+
val contains : t -> char -> bool
745+
746+
(** [contains_from s start c] tests if byte [c] appears in [s] after
747+
position [start]. [contains s c] is equivalent to [contains_from
748+
s 0 c].
749+
Raise [Invalid_argument] if [start] is not a valid position in [s]. *)
750+
val contains_from : t -> int -> char -> bool
751+
752+
(** [rcontains_from s stop c] tests if byte [c] appears in [s] before
753+
position [stop+1].
754+
Raise [Invalid_argument] if [stop < 0] or [stop+1] is not a valid
755+
position in [s]. *)
756+
val rcontains_from : t -> int -> char -> bool
757+
758+
(** [uppercase t] returns a copy of [t], with all lowercase letters
759+
translated to uppercase, including accented letters of the ISO
760+
Latin-1 (8859-1) character set. *)
761+
val uppercase : t -> t
762+
763+
(** [lowercase t] returns a copy of [t], with all uppercase letters
764+
translated to lowercase, including accented letters of the ISO
765+
Latin-1 (8859-1) character set. *)
766+
val lowercase : t -> t
767+
768+
(** [capitalize t] returns a copy of [t], with the first byte set
769+
to uppercase. *)
770+
val capitalize : t -> t
771+
772+
(** [uncapitalize t] returns a copy of [t], with the first byte set
773+
to lowercase. *)
774+
val uncapitalize : t -> t
775+
776+
(** {4 Unsafe conversions (for advanced users)}
777+
778+
This section describes unsafe, low-level conversion functions
779+
between [bytes] and [string]. They do not copy the internal data;
780+
used improperly, they can break the immutability invariant on
781+
strings provided by the [-safe-string] option. They are available for
782+
expert library authors, but for most purposes you should use the
783+
always-correct {!Bytes.to_string} and {!Bytes.of_string} instead. *)
784+
module Unsafe : sig
785+
786+
(** [to_string b] - unsafely converts a byte sequence into a string.
787+
788+
To reason about the use of [to_string], it is convenient to
789+
consider an "ownership" discipline. A piece of code that
790+
manipulates some data "owns" it; there are several disjoint ownership
791+
modes, including:
792+
- Unique ownership: the data may be accessed and mutated
793+
- Shared ownership: the data has several owners, that may only
794+
access it, not mutate it.
795+
796+
Unique ownership is linear: passing the data to another piece of
797+
code means giving up ownership (we cannot write the
798+
data again). A unique owner may decide to make the data shared
799+
(giving up mutation rights on it), but shared data may not become
800+
uniquely-owned again.
801+
802+
[to_string s] can only be used when the caller owns the byte
803+
sequence [s] -- either uniquely or as shared immutable data. The
804+
caller gives up ownership of [s], and gains ownership of the
805+
returned string.
806+
807+
There are two valid use-cases that respect this ownership
808+
discipline:
809+
810+
1. Creating a string by initializing and mutating a byte sequence
811+
that is never changed after initialization is performed.
812+
813+
{[
814+
let string_init len f : string =
815+
let s = Bytes.create len in
816+
for i = 0 to len - 1 do Bytes.set s i (f i) done;
817+
Bytes.Unsafe.to_string s
818+
]}
819+
820+
This function is safe because the byte sequence [s] will never be
821+
accessed or mutated after [to_string] is called. The
822+
[string_init] code gives up ownership of [s], and returns the
823+
ownership of the resulting string to its caller.
824+
825+
Note that it would be unsafe if [s] was passed as an additional
826+
parameter to the function [f] as it could escape this way and be
827+
mutated in the future -- [string_init] would give up ownership of
828+
[s] to pass it to [f], and could not call [to_string]
829+
safely.
830+
831+
We have provided the {!String.init}, {!String.map} and
832+
{!String.mapi} functions to cover most cases of building
833+
new strings. You should prefer those over [to_string]
834+
whenever applicable.
835+
836+
2. Temporarily giving ownership of a byte sequence to a function
837+
that expects a uniquely owned string and returns ownership back, so
838+
that we can mutate the sequence again after the call ended.
839+
840+
{[
841+
let bytes_length (s : bytes) =
842+
String.length (Bytes.Unsafe.to_string s)
843+
]}
844+
845+
In this use-case, we do not promise that [s] will never be mutated
846+
after the call to [bytes_length s]. The {!String.length} function
847+
temporarily borrows unique ownership of the byte sequence
848+
(and sees it as a [string]), but returns this ownership back to
849+
the caller, which may assume that [s] is still a valid byte
850+
sequence after the call. Note that this is only correct because we
851+
know that {!String.length} does not capture its argument -- it could
852+
escape by a side-channel such as a memoization combinator.
853+
854+
The caller may not mutate [s] while the string is borrowed (it has
855+
temporarily given up ownership). This affects concurrent programs,
856+
but also higher-order functions: if [String.length] returned
857+
a closure to be called later, [s] should not be mutated until this
858+
closure is fully applied and returns ownership. *)
859+
val to_string : t -> string
860+
861+
(** [of_string s] - unsafely converts a shared string to a byte
862+
sequence that should not be mutated.
863+
864+
The same ownership discipline that makes [to_string]
865+
correct applies to [of_string]: you may use it if you were
866+
the owner of the [string] value, and you will own the return
867+
[bytes] in the same mode.
868+
869+
In practice, unique ownership of string values is extremely
870+
difficult to reason about correctly. You should always assume
871+
strings are shared, never uniquely owned.
872+
873+
For example, string literals are implicitly shared by the
874+
compiler, so you never uniquely own them.
875+
876+
{[
877+
let incorrect = Bytes.Unsafe.of_string "hello"
878+
let s = Bytes.of_string "hello"
879+
]}
880+
881+
The first declaration is incorrect, because the string literal
882+
["hello"] could be shared by the compiler with other parts of the
883+
program, and mutating [incorrect] is a bug. You must always use
884+
the second version, which performs a copy and is thus correct.
885+
886+
Assuming unique ownership of strings that are not string
887+
literals, but are (partly) built from string literals, is also
888+
incorrect. For example, mutating [of_string ("foo" ^ s)]
889+
could mutate the shared string ["foo"] -- assuming a rope-like
890+
representation of strings. More generally, functions operating on
891+
strings will assume shared ownership, they do not preserve unique
892+
ownership. It is thus incorrect to assume unique ownership of the
893+
result of [of_string].
894+
895+
The only case we are reasonably confident is safe is when the
896+
produced [bytes] is shared -- used as an immutable byte
897+
sequence. This is possibly useful for incremental migration of
898+
low-level programs that manipulate immutable sequences of bytes
899+
(for example {!Marshal.from_bytes}) and previously used the
900+
[string] type for this purpose. *)
901+
val of_string : string -> t
902+
903+
(** The following is for system use only. Do not call directly. *)
904+
external get : t -> int -> char = "%string_unsafe_get"
905+
external set : t -> int -> char -> unit = "%string_unsafe_set"
906+
external blit : t -> int -> t -> int -> int -> unit = "caml_blit_string" "noalloc"
907+
external fill : t -> int -> int -> char -> unit = "caml_fill_string" "noalloc"
908+
909+
end
910+
end
911+
912+
type bytes = Bytes.t
913+
611914
type 'a reader
612915
type 'a writer
613916

0 commit comments

Comments
 (0)