@@ -608,6 +608,309 @@ module Std : sig
608
608
val pkg_version : string
609
609
end
610
610
611
+ module Bytes : sig
612
+
613
+ type t = Bytes .t with bin_io , compare , sexp (* The byte-sequence type handled by this module; it is equal to the
   [Bytes.t] already in scope, and the [with] clause additionally declares
   bin_io, compare and sexp support for it. *)
614
+
615
+ include Container. S0 with type t := t with type elt := char (* container interface over [t] whose elements are [char] values *)
616
+ include Blit. S with type t := t (* blit interface with [t] as the sequence type;
   NOTE(review): presumably covers blit/sub style operations from [t] to [t] -- confirm against Blit.S *)
617
+ include Identifiable. S with type t := t (* NOTE(review): presumably brings in comparison, hashing and string/sexp
   conversions for [t] -- confirm against Identifiable.S; several names
   declared further down in this signature may repeat items included here *)
618
+ module To_string : Blit .S_distinct with type src := t with type dst := string (* blit operations whose source is a [t] and whose destination is a [string] *)
619
+ module From_string : Blit .S_distinct with type src := string with type dst := t (* blit operations whose source is a [string] and whose destination is a [t] *)
620
+
621
+ (* * [create n] returns a new byte sequence of length [n]. The
622
+ sequence is uninitialized and contains arbitrary bytes.
623
+ Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
624
+ external create : int -> t = " caml_create_string"
625
+
626
+ (* * [make n c] returns a new byte sequence of length [n], filled with
627
+ the byte [c].
628
+ Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
629
+ val make : int -> char -> t
630
+
631
+ (* * [init n ~f] returns a fresh byte sequence of length [n], with
632
+ character [i] initialized to the result of [f i] (in increasing
633
+ index order).
634
+ Raise [Invalid_argument] if [n < 0] or [n > ]{!Sys.max_string_length}. *)
635
+ val init : int -> f :(int -> char ) -> t
636
+
637
+ (* * [empty] is a byte sequence of size 0. *)
638
+ val empty : t
639
+
640
+ (* * [length t] returns the length (number of bytes) of [t]. *)
641
+ external length : t -> int = " %string_length"
642
+
643
+ (* * [get s n] returns the byte at index [n] in [s].
644
+ Raise [Invalid_argument] if [n] is not a valid index in [s]. *)
645
+ external get : t -> int -> char = " %string_safe_get"
646
+
647
+ (* * [set s n c] modifies [s] in place, replacing the byte at index [n]
648
+ with [c].
649
+ Raise [Invalid_argument] if [n] is not a valid index in [s]. *)
650
+ external set : t -> int -> char -> unit = " %string_safe_set"
651
+
652
+ (* * [copy t] returns a new byte sequence that contains the same
653
+ bytes as [t]. *)
654
+ val copy : t -> t
655
+
656
+ (* * [of_string s] returns a new byte sequence that contains the
657
+ same bytes as the given string. *)
658
+ val of_string : string -> t
659
+
660
+ (* * [to_string t] returns a new string that contains the same
661
+ bytes as the given byte sequence. *)
662
+ val to_string : t -> string
663
+
664
+ (* * [extend s left right] returns a new byte sequence that contains
665
+ the bytes of [s], with [left] uninitialized bytes prepended and
666
+ [right] uninitialized bytes appended to it. If [left] or [right]
667
+ is negative, then bytes are removed (instead of appended) from
668
+ the corresponding side of [s].
669
+ Raise [Invalid_argument] if the result length is negative or
670
+ longer than {!Sys.max_string_length} bytes. *)
671
+ val extend : t -> int -> int -> t
672
+
673
+ (* * [fill s start len c] modifies [s] in place, replacing [len]
674
+ characters with [c], starting at [start].
675
+ Raise [Invalid_argument] if [start] and [len] do not designate a
676
+ valid range of [s]. *)
677
+ val fill : t -> int -> int -> char -> unit
678
+
679
+ (* * [concat sep sl] concatenates the list of byte sequences [sl],
680
+ inserting the separator byte sequence [sep] between each, and
681
+ returns the result as a new byte sequence.
682
+ Raise [Invalid_argument] if the result is longer than
683
+ {!Sys.max_string_length} bytes. *)
684
+ val concat : t -> t list -> t
685
+
686
+ (* * [cat s1 s2] concatenates [s1] and [s2] and returns the result
687
+ as a new byte sequence.
688
+ Raise [Invalid_argument] if the result is longer than
689
+ {!Sys.max_string_length} bytes. *)
690
+ val cat : t -> t -> t
691
+
692
+ (* * [iteri t ~f] is the same as {!iter}, but the function is
693
+ applied to the index of the byte as first argument and the
694
+ byte itself as second argument. *)
695
+ val iteri : t -> f :(int -> char -> unit ) -> unit
696
+
697
+ (* * [map s ~f] applies function [f] in turn to all the bytes of [s]
698
+ (in increasing index order) and stores the resulting bytes in
699
+ a new sequence that is returned as the result. *)
700
+ val map : t -> f :(char -> char ) -> t
701
+
702
+ (* * [mapi s ~f] calls [f] with each character of [s] and its
703
+ index (in increasing index order) and stores the resulting bytes
704
+ in a new sequence that is returned as the result. *)
705
+ val mapi : t -> f :(int -> char -> char ) -> t
706
+
707
+ (* * [trim t] returns a copy of [t], without leading and trailing
708
+ whitespace. The bytes regarded as whitespace are the ASCII
709
+ characters [' '], ['\012'], ['\n'], ['\r'], and ['\t']. *)
710
+ val trim : t -> t
711
+
712
+ (* * [escaped t] returns a copy of [t], with special characters
713
+ represented by escape sequences, following the lexical
714
+ conventions of OCaml.
715
+ Raise [Invalid_argument] if the result is longer than
716
+ {!Sys.max_string_length} bytes. *)
717
+ val escaped : t -> t
718
+
719
+ (* * [index s c] returns the index of the first occurrence of byte [c]
720
+ in [s].
721
+ Raise [Not_found] if [c] does not occur in [s]. *)
722
+ val index : t -> char -> int
723
+
724
+ (* * [rindex s c] returns the index of the last occurrence of byte [c]
725
+ in [s].
726
+ Raise [Not_found] if [c] does not occur in [s]. *)
727
+ val rindex : t -> char -> int
728
+
729
+ (* * [index_from s i c] returns the index of the first occurrence of
730
+ byte [c] in [s] after position [i]. [index s c] is
731
+ equivalent to [index_from s 0 c].
732
+ Raise [Invalid_argument] if [i] is not a valid position in [s].
733
+ Raise [Not_found] if [c] does not occur in [s] after position [i]. *)
734
+ val index_from : t -> int -> char -> int
735
+
736
+ (* * [rindex_from s i c] returns the index of the last occurrence of
737
+ byte [c] in [s] before position [i+1]. [rindex s c] is equivalent
738
+ to [rindex_from s (length s - 1) c].
739
+ Raise [Invalid_argument] if [i+1] is not a valid position in [s].
740
+ Raise [Not_found] if [c] does not occur in [s] before position [i+1]. *)
741
+ val rindex_from : t -> int -> char -> int
742
+
743
+ (* * [contains s c] tests if byte [c] appears in [s]. *)
744
+ val contains : t -> char -> bool
745
+
746
+ (* * [contains_from s start c] tests if byte [c] appears in [s] after
747
+ position [start]. [contains s c] is equivalent to [contains_from
748
+ s 0 c].
749
+ Raise [Invalid_argument] if [start] is not a valid position in [s]. *)
750
+ val contains_from : t -> int -> char -> bool
751
+
752
+ (* * [rcontains_from s stop c] tests if byte [c] appears in [s] before
753
+ position [stop+1].
754
+ Raise [Invalid_argument] if [stop < 0] or [stop+1] is not a valid
755
+ position in [s]. *)
756
+ val rcontains_from : t -> int -> char -> bool
757
+
758
+ (* * [uppercase t] returns a copy of [t], with all lowercase letters
759
+ translated to uppercase, including accented letters of the ISO
760
+ Latin-1 (8859-1) character set. *)
761
+ val uppercase : t -> t
762
+
763
+ (* * [lowercase t] returns a copy of [t], with all uppercase letters
764
+ translated to lowercase, including accented letters of the ISO
765
+ Latin-1 (8859-1) character set. *)
766
+ val lowercase : t -> t
767
+
768
+ (* * [capitalize t] returns a copy of [t], with the first byte set
769
+ to uppercase. *)
770
+ val capitalize : t -> t
771
+
772
+ (* * [uncapitalize t] returns a copy of [t], with the first byte set
773
+ to lowercase. *)
774
+ val uncapitalize : t -> t
775
+
776
+ (* * {4 Unsafe conversions (for advanced users)}
777
+
778
+ This section describes unsafe, low-level conversion functions
779
+ between [bytes] and [string]. They do not copy the internal data;
780
+ used improperly, they can break the immutability invariant on
781
+ strings provided by the [-safe-string] option. They are available for
782
+ expert library authors, but for most purposes you should use the
783
+ always-correct {!Bytes.to_string} and {!Bytes.of_string} instead. *)
784
+ module Unsafe : sig
785
+
786
+ (* * [to_string s] - unsafely converts a byte sequence into a string.
787
+
788
+ To reason about the use of [to_string], it is convenient to
789
+ consider an "ownership" discipline. A piece of code that
790
+ manipulates some data "owns" it; there are several disjoint ownership
791
+ modes, including:
792
+ - Unique ownership: the data may be accessed and mutated
793
+ - Shared ownership: the data has several owners, that may only
794
+ access it, not mutate it.
795
+
796
+ Unique ownership is linear: passing the data to another piece of
797
+ code means giving up ownership (we cannot write the
798
+ data again). A unique owner may decide to make the data shared
799
+ (giving up mutation rights on it), but shared data may not become
800
+ uniquely-owned again.
801
+
802
+ [to_string s] can only be used when the caller owns the byte
803
+ sequence [s] -- either uniquely or as shared immutable data. The
804
+ caller gives up ownership of [s], and gains ownership of the
805
+ returned string.
806
+
807
+ There are two valid use-cases that respect this ownership
808
+ discipline:
809
+
810
+ 1. Creating a string by initializing and mutating a byte sequence
811
+ that is never changed after initialization is performed.
812
+
813
+ {[
814
+ let string_init len f : string =
815
+ let s = Bytes.create len in
816
+ for i = 0 to len - 1 do Bytes.set s i (f i) done;
817
+ Bytes.Unsafe.to_string s
818
+ ]}
819
+
820
+ This function is safe because the byte sequence [s] will never be
821
+ accessed or mutated after [to_string] is called. The
822
+ [string_init] code gives up ownership of [s], and returns the
823
+ ownership of the resulting string to its caller.
824
+
825
+ Note that it would be unsafe if [s] was passed as an additional
826
+ parameter to the function [f] as it could escape this way and be
827
+ mutated in the future -- [string_init] would give up ownership of
828
+ [s] to pass it to [f], and could not call [to_string]
829
+ safely.
830
+
831
+ We have provided the {!String.init}, {!String.map} and
832
+ {!String.mapi} functions to cover most cases of building
833
+ new strings. You should prefer those over {!Bytes.to_string} or
834
+ {!Bytes.Unsafe.to_string} whenever applicable.
835
+
836
+ 2. Temporarily giving ownership of a byte sequence to a function
837
+ that expects a uniquely owned string and returns ownership back, so
838
+ that we can mutate the sequence again after the call ended.
839
+
840
+ {[
841
+ let bytes_length (s : bytes) =
842
+ String.length (Bytes.Unsafe.to_string s)
843
+ ]}
844
+
845
+ In this use-case, we do not promise that [s] will never be mutated
846
+ after the call to [bytes_length s]. The {!String.length} function
847
+ temporarily borrows unique ownership of the byte sequence
848
+ (and sees it as a [string]), but returns this ownership back to
849
+ the caller, which may assume that [s] is still a valid byte
850
+ sequence after the call. Note that this is only correct because we
851
+ know that {!String.length} does not capture its argument -- it could
852
+ escape by a side-channel such as a memoization combinator.
853
+
854
+ The caller may not mutate [s] while the string is borrowed (it has
855
+ temporarily given up ownership). This affects concurrent programs,
856
+ but also higher-order functions: if [String.length] returned
857
+ a closure to be called later, [s] should not be mutated until this
858
+ closure is fully applied and returns ownership. *)
859
+ val to_string : t -> string
860
+
861
+ (* * [of_string s] - unsafely converts a shared string to a byte
862
+ sequence that should not be mutated.
863
+
864
+ The same ownership discipline that makes [to_string]
865
+ correct applies to [of_string]: you may use it if you were
866
+ the owner of the [string] value, and you will own the return
867
+ [bytes] in the same mode.
868
+
869
+ In practice, unique ownership of string values is extremely
870
+ difficult to reason about correctly. You should always assume
871
+ strings are shared, never uniquely owned.
872
+
873
+ For example, string literals are implicitly shared by the
874
+ compiler, so you never uniquely own them.
875
+
876
+ {[
877
+ let incorrect = Bytes.Unsafe.of_string "hello"
878
+ let s = Bytes.of_string "hello"
879
+ ]}
880
+
881
+ The first declaration is incorrect, because the string literal
882
+ ["hello"] could be shared by the compiler with other parts of the
883
+ program, and mutating [incorrect] is a bug. You must always use
884
+ the second version, which performs a copy and is thus correct.
885
+
886
+ Assuming unique ownership of strings that are not string
887
+ literals, but are (partly) built from string literals, is also
888
+ incorrect. For example, mutating [of_string ("foo" ^ s)]
889
+ could mutate the shared string ["foo"] -- assuming a rope-like
890
+ representation of strings. More generally, functions operating on
891
+ strings will assume shared ownership, they do not preserve unique
892
+ ownership. It is thus incorrect to assume unique ownership of the
893
+ result of [of_string].
894
+
895
+ The only case we have reasonable confidence is safe is if the
896
+ produced [bytes] is shared -- used as an immutable byte
897
+ sequence. This is possibly useful for incremental migration of
898
+ low-level programs that manipulate immutable sequences of bytes
899
+ (for example {!Marshal.from_bytes}) and previously used the
900
+ [string] type for this purpose. *)
901
+ val of_string : string -> t
902
+
903
+ (* * The following is for system use only. Do not call directly. *)
904
+ external get : t -> int -> char = " %string_unsafe_get" (* bound to the compiler primitive %string_unsafe_get, whereas the
   top-level [get] of this signature is bound to %string_safe_get;
   NOTE(review): presumably this variant skips the index check -- confirm *)
905
+ external set : t -> int -> char -> unit = " %string_unsafe_set" (* bound to %string_unsafe_set, the counterpart of the %string_safe_set
   primitive used by the top-level [set];
   NOTE(review): presumably no index check is performed -- confirm *)
906
+ external blit : t -> int -> t -> int -> int -> unit = " caml_blit_string" " noalloc" (* C stub caml_blit_string, declared with the noalloc attribute;
   NOTE(review): the arguments look like source, source offset, destination,
   destination offset, length -- confirm; no range validation is visible here *)
907
+ external fill : t -> int -> int -> char -> unit = " caml_fill_string" " noalloc" (* C stub caml_fill_string, declared with the noalloc attribute;
   NOTE(review): the arguments presumably follow the top-level [fill]
   (sequence, start, length, byte) -- confirm.
   NOTE(review): every primitive name string in this rendering carries a
   leading space, which looks like an artifact of how the diff was
   displayed -- check the real file before relying on the exact text *)
908
+
909
+ end
910
+ end
911
+
912
+ type bytes = Bytes .t (* Short name [bytes] for the type [t] of the [Bytes] module declared just above. *)
913
+
611
914
type 'a reader
612
915
type 'a writer
613
916
0 commit comments