Skip to content

Commit f9c1679

Browse files
committed
[GR-19220] Improve Ruby string interning (#3216)
PullRequest: truffleruby/3948
2 parents c1d8453 + fd03b3e commit f9c1679

File tree

10 files changed

+155
-34
lines changed

10 files changed

+155
-34
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ New features:
77
Bug fixes:
88

99
* Fix `Dir.glob` returning blank string entry with leading `**/` in glob and `base:` argument (@rwstauner).
10-
* Fix class lookup after an object's class has been replaced by `IO#reopen` (@itarato, @eregon).
10+
* Fix class lookup after an object's class has been replaced by `IO#reopen` (@itarato, @nirvdrum, @eregon).
1111
* Fix `Marshal.load` and raise `ArgumentError` when dump is broken and is too short (#3108, @andrykonchin).
1212
* Fix `super` method lookup for unbounded attached methods (#3131, @itarato).
1313
* Fix `Module#define_method(name, Method)` to respect `module_function` visibility (#3181, @andrykonchin).
@@ -44,6 +44,7 @@ Compatibility:
4444
Performance:
4545

4646
* Improve `Truffle::FeatureLoader.loaded_feature_path` by removing expensive string ops from a loop. Speeds up feature lookup time (#3010, @itarato).
47+
* Improve `String#-@` performance by reducing unnecessary data copying and supporting substring lookups (@nirvdrum).
4748

4849
Changes:
4950

doc/user/truffleruby-additions.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ TruffleRuby provides these non-standard methods and classes that provide additio
6060

6161
### Concurrent Maps
6262

63-
`TruffleRuby::ConcurrentMap` is a key-value data structure, like a `Hash` and using `#hash` and `#eql?` to compare keys and identity to compare values. Unlike `Hash` it is unordered. All methods on `TruffleRuby::ConcurrentMap` are thread-safe but should have higher concurrency than a fully syncronized implementation. It is intended to be used by gems such as [`concurrent-ruby`](https://github.com/ruby-concurrency/concurrent-ruby) - please use via this gem rather than using directly.
63+
`TruffleRuby::ConcurrentMap` is a key-value data structure, like a `Hash` and using `#hash` and `#eql?` to compare keys and identity to compare values. Unlike `Hash` it is unordered. All methods on `TruffleRuby::ConcurrentMap` are thread-safe but should have higher concurrency than a fully synchronized implementation. It is intended to be used by gems such as [`concurrent-ruby`](https://github.com/ruby-concurrency/concurrent-ruby) - please use via this gem rather than using directly.
6464

6565
* `map = TruffleRuby::ConcurrentMap.new([initial_capacity: ...], [load_factor: ...])`
6666

spec/ruby/core/io/new_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
require_relative '../../spec_helper'
22
require_relative 'shared/new'
33

4-
# NOTE: should be syncronized with library/stringio/initialize_spec.rb
4+
# NOTE: should be synchronized with library/stringio/initialize_spec.rb
55

66
describe "IO.new" do
77
it_behaves_like :io_new, :new

spec/ruby/core/io/shared/new.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
require_relative '../fixtures/classes'
22

3-
# NOTE: should be syncronized with library/stringio/initialize_spec.rb
3+
# NOTE: should be synchronized with library/stringio/initialize_spec.rb
44

55
# This group of specs may ONLY contain specs that do successfully create
66
# an IO instance from the file descriptor returned by #new_fd helper.

src/main/java/org/truffleruby/RubyLanguage.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import com.oracle.truffle.api.source.Source;
3434
import com.oracle.truffle.api.source.SourceSection;
3535
import com.oracle.truffle.api.strings.AbstractTruffleString;
36+
import com.oracle.truffle.api.strings.InternalByteArray;
3637
import com.oracle.truffle.api.strings.TruffleString;
3738
import org.graalvm.options.OptionDescriptors;
3839
import org.truffleruby.annotations.SuppressFBWarnings;
@@ -788,6 +789,11 @@ public ImmutableRubyString getFrozenStringLiteral(TruffleString tstring, RubyEnc
788789
return frozenStringLiterals.getFrozenStringLiteral(tstring, encoding);
789790
}
790791

792+
public ImmutableRubyString getFrozenStringLiteral(InternalByteArray byteArray, boolean isImmutable,
793+
RubyEncoding encoding) {
794+
return frozenStringLiterals.getFrozenStringLiteral(byteArray, isImmutable, encoding);
795+
}
796+
791797
public long getNextObjectID() {
792798
final long id = nextObjectID.getAndAdd(ObjectSpaceManager.OBJECT_ID_INCREMENT_BY);
793799

src/main/java/org/truffleruby/core/encoding/TStringUtils.java

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ public static TruffleString.Encoding jcodingToTEncoding(Encoding jcoding) {
4444
}
4545

4646
public static TruffleString fromByteArray(byte[] bytes, TruffleString.Encoding tencoding) {
47+
return fromByteArray(bytes, 0, bytes.length, tencoding);
48+
}
49+
50+
public static TruffleString fromByteArray(byte[] bytes, int offset, int length, TruffleString.Encoding tencoding) {
4751
CompilerAsserts.neverPartOfCompilation(
4852
"Use createString(TruffleString.FromByteArrayNode, byte[], RubyEncoding) instead");
49-
return TruffleString.fromByteArrayUncached(bytes, 0, bytes.length, tencoding, false);
53+
return TruffleString.fromByteArrayUncached(bytes, offset, length, tencoding, false);
5054
}
5155

5256
public static TruffleString fromByteArray(byte[] bytes, RubyEncoding rubyEncoding) {
@@ -75,8 +79,7 @@ public static TruffleString fromJavaString(String javaString, RubyEncoding encod
7579
public static byte[] getBytesOrCopy(AbstractTruffleString tstring, RubyEncoding encoding) {
7680
CompilerAsserts.neverPartOfCompilation("uncached");
7781
var bytes = tstring.getInternalByteArrayUncached(encoding.tencoding);
78-
if (tstring instanceof TruffleString && bytes.getOffset() == 0 &&
79-
bytes.getLength() == bytes.getArray().length) {
82+
if (tstring.isImmutable() && bytes.getOffset() == 0 && bytes.getLength() == bytes.getArray().length) {
8083
return bytes.getArray();
8184
} else {
8285
return ArrayUtils.extractRange(bytes.getArray(), bytes.getOffset(), bytes.getEnd());
@@ -88,8 +91,8 @@ public static byte[] getBytesOrCopy(Node node, AbstractTruffleString tstring, Tr
8891
TruffleString.GetInternalByteArrayNode getInternalByteArrayNode,
8992
InlinedConditionProfile noCopyProfile) {
9093
var bytes = getInternalByteArrayNode.execute(tstring, encoding);
91-
if (noCopyProfile.profile(node, tstring instanceof TruffleString && bytes.getOffset() == 0 &&
92-
bytes.getLength() == bytes.getArray().length)) {
94+
if (noCopyProfile.profile(node,
95+
tstring.isImmutable() && bytes.getOffset() == 0 && bytes.getLength() == bytes.getArray().length)) {
9396
return bytes.getArray();
9497
} else {
9598
return ArrayUtils.extractRange(bytes.getArray(), bytes.getOffset(), bytes.getEnd());
@@ -149,4 +152,10 @@ public static String toJavaStringOrThrow(AbstractTruffleString tstring, RubyEnco
149152
return tstring.toJavaStringUncached();
150153
}
151154
}
155+
156+
public static boolean hasImmutableInternalByteArray(AbstractTruffleString string) {
157+
// Immutable strings trivially have immutable byte arrays.
158+
// Native strings also have immutable byte arrays because we need to copy the data into Java.
159+
return string.isImmutable() || string.isNative();
160+
}
152161
}

src/main/java/org/truffleruby/core/string/FrozenStringLiterals.java

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import com.oracle.truffle.api.CompilerDirectives;
1313
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
14+
import com.oracle.truffle.api.strings.InternalByteArray;
1415
import com.oracle.truffle.api.strings.TruffleString;
1516
import org.truffleruby.collections.WeakValueCache;
1617
import org.truffleruby.core.encoding.RubyEncoding;
@@ -37,25 +38,23 @@ public FrozenStringLiterals(TStringCache tStringCache) {
3738

3839
@TruffleBoundary
3940
public ImmutableRubyString getFrozenStringLiteral(TruffleString tstring, RubyEncoding encoding) {
40-
if (tstring.isNative()) {
41-
throw CompilerDirectives.shouldNotReachHere();
42-
}
43-
44-
return getFrozenStringLiteral(TStringUtils.getBytesOrCopy(tstring, encoding), encoding);
41+
return getFrozenStringLiteral(tstring.getInternalByteArrayUncached(encoding.tencoding),
42+
TStringUtils.hasImmutableInternalByteArray(tstring),
43+
encoding);
4544
}
4645

4746
@TruffleBoundary
48-
public ImmutableRubyString getFrozenStringLiteral(byte[] bytes, RubyEncoding encoding) {
47+
public ImmutableRubyString getFrozenStringLiteral(InternalByteArray byteArray, boolean isImmutable,
48+
RubyEncoding encoding) {
4949
// Ensure all ImmutableRubyString have a TruffleString from the TStringCache
50-
var cachedTString = tstringCache.getTString(bytes, encoding);
50+
var cachedTString = tstringCache.getTString(byteArray, isImmutable, encoding);
5151
var tstringWithEncoding = new TStringWithEncoding(cachedTString, encoding);
5252

5353
final ImmutableRubyString string = values.get(tstringWithEncoding);
5454
if (string != null) {
5555
return string;
5656
} else {
57-
return values.addInCacheIfAbsent(tstringWithEncoding,
58-
new ImmutableRubyString(cachedTString, encoding));
57+
return values.addInCacheIfAbsent(tstringWithEncoding, new ImmutableRubyString(cachedTString, encoding));
5958
}
6059
}
6160

src/main/java/org/truffleruby/core/string/StringNodes.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4357,10 +4357,11 @@ public abstract static class InternNode extends PrimitiveArrayArgumentsNode {
43574357
@Specialization
43584358
protected ImmutableRubyString internString(RubyString string,
43594359
@Cached RubyStringLibrary libString,
4360-
@Cached TruffleString.AsManagedNode asManagedNode) {
4360+
@Cached TruffleString.GetInternalByteArrayNode getInternalByteArrayNode) {
43614361
var encoding = libString.getEncoding(string);
4362-
TruffleString immutableManagedString = asManagedNode.execute(string.tstring, encoding.tencoding);
4363-
return getLanguage().getFrozenStringLiteral(immutableManagedString, encoding);
4362+
var byteArray = getInternalByteArrayNode.execute(string.tstring, encoding.tencoding);
4363+
return getLanguage().getFrozenStringLiteral(byteArray,
4364+
TStringUtils.hasImmutableInternalByteArray(string.tstring), encoding);
43644365
}
43654366
}
43664367

src/main/java/org/truffleruby/core/string/TBytesKey.java

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,49 @@
1212
import java.util.Arrays;
1313
import java.util.Objects;
1414

15+
import com.oracle.truffle.api.strings.InternalByteArray;
1516
import com.oracle.truffle.api.strings.TruffleString;
17+
import org.truffleruby.core.array.ArrayUtils;
1618
import org.truffleruby.core.encoding.RubyEncoding;
19+
import org.truffleruby.core.encoding.TStringUtils;
1720

1821
public final class TBytesKey {
1922

2023
private final byte[] bytes;
24+
private final int offset;
25+
private final int length;
2126
private RubyEncoding encoding;
2227
private final int bytesHashCode;
2328

24-
public TBytesKey(byte[] bytes, RubyEncoding encoding) {
29+
public TBytesKey(
30+
byte[] bytes,
31+
int offset,
32+
int length,
33+
int bytesHashCode,
34+
RubyEncoding encoding) {
2535
this.bytes = bytes;
36+
this.offset = offset;
37+
this.length = length;
38+
this.bytesHashCode = bytesHashCode;
2639
this.encoding = encoding;
27-
this.bytesHashCode = Arrays.hashCode(bytes);
40+
}
41+
42+
public TBytesKey(byte[] bytes, RubyEncoding encoding) {
43+
this(bytes, 0, bytes.length, Arrays.hashCode(bytes), encoding);
44+
}
45+
46+
/** Supports the creation of a cache key using a subset of bytes. This key *must* be used for lookups only. If you
47+
* want to insert into the cache, you *must* use the result of {@link #makeCacheable(boolean)}.
48+
*
49+
* @param byteArray A byte array retrieved from a {@link TruffleString}
50+
* @param encoding The Ruby encoding object needed to properly decode the associated byte array */
51+
public TBytesKey(InternalByteArray byteArray, RubyEncoding encoding) {
52+
this(
53+
byteArray.getArray(),
54+
byteArray.getOffset(),
55+
byteArray.getLength(),
56+
hashCode(byteArray),
57+
encoding);
2858
}
2959

3060
@Override
@@ -37,15 +67,15 @@ public boolean equals(Object o) {
3767
if (o instanceof TBytesKey) {
3868
final TBytesKey other = (TBytesKey) o;
3969
if (encoding == null) {
40-
if (Arrays.equals(bytes, other.bytes)) {
70+
if (equalBytes(this, other)) {
4171
// For getMatchedEncoding()
4272
this.encoding = Objects.requireNonNull(other.encoding);
4373
return true;
4474
} else {
4575
return false;
4676
}
4777
} else {
48-
return encoding == other.encoding && Arrays.equals(bytes, other.bytes);
78+
return encoding == other.encoding && equalBytes(this, other);
4979
}
5080
}
5181

@@ -62,4 +92,57 @@ public String toString() {
6292
return TruffleString.fromByteArrayUncached(bytes, encoding, false).toString();
6393
}
6494

95+
private static int hashCode(InternalByteArray byteArray) {
96+
return hashCode(byteArray.getArray(), byteArray.getOffset(), byteArray.getLength());
97+
}
98+
99+
/** A variant of {@link Arrays#hashCode(byte[])} that allows for selecting a range within the array. */
100+
private static int hashCode(byte[] bytes, int offset, int length) {
101+
int result = 1;
102+
for (int i = offset; i < offset + length; i++) {
103+
result = 31 * result + bytes[i];
104+
}
105+
106+
return result;
107+
}
108+
109+
private boolean equalBytes(TBytesKey a, TBytesKey b) {
110+
return Arrays.equals(a.bytes, a.offset, a.offset + a.length, b.bytes, b.offset, b.offset + b.length);
111+
}
112+
113+
private boolean isPerfectFit() {
114+
return offset == 0 && length == bytes.length;
115+
}
116+
117+
/** Returns a cache key suitable for insertion into the string cache. It's quite common that we want to cache a
118+
* substring. Since we don't want to retain the entire original string, we resolve the substring by making a copy of
119+
* the byte range that we need. However, that is a costly operation and that work is discarded in the event of a
120+
* cache hit. To avoid incurring that cost unnecessarily, we allow cache keys to refer to a subset of a byte array.
121+
* While that saves computation during a cache lookup, it means such keys are unsuitable for insertion into the
122+
* cache. This method makes a key we can use safely for insertion.
123+
* <p>
124+
* If we know that the key refers to an immutable byte array and the key does not refer to a substring, we can
125+
* safely refer to the original byte array without needing to make an additional copy.
126+
*
127+
* @param isImmutable whether the key's byte array is immutable
128+
* @return a cache key suitable for insertion */
129+
public TBytesKey makeCacheable(boolean isImmutable) {
130+
if (isImmutable && isPerfectFit()) {
131+
return this;
132+
}
133+
134+
// Make a copy of the substring's bytes so we can cache them without retaining the larger original byte array.
135+
var resolvedSubstring = ArrayUtils.extractRange(this.bytes, this.offset, this.offset + this.length);
136+
137+
return new TBytesKey(resolvedSubstring, encoding);
138+
}
139+
140+
public TBytesKey withNewEncoding(RubyEncoding encoding) {
141+
return new TBytesKey(bytes, offset, length, bytesHashCode, encoding);
142+
}
143+
144+
public TruffleString toTruffleString() {
145+
return TStringUtils.fromByteArray(bytes, offset, length, encoding.tencoding);
146+
}
147+
65148
}

src/main/java/org/truffleruby/core/string/TStringCache.java

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
*/
1010
package org.truffleruby.core.string;
1111

12+
import com.oracle.truffle.api.strings.InternalByteArray;
1213
import com.oracle.truffle.api.strings.TruffleString;
1314
import org.truffleruby.collections.WeakValueCache;
1415
import org.truffleruby.core.encoding.Encodings;
@@ -20,6 +21,10 @@
2021
import com.oracle.truffle.api.CompilerDirectives;
2122
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
2223

24+
/** This cache caches (byte[], encoding) to TruffleString. The main value is from using it for string literals in files
25+
* without {@code # frozen_string_literal: true}, so equivalent string literals are shared. For most other usages there
26+
* is another higher-level cache but this cache then helps to deduplicate TruffleStrings across the different
27+
* higher-level caches. */
2328
public final class TStringCache {
2429

2530
private final WeakValueCache<TBytesKey, TruffleString> bytesToTString = new WeakValueCache<>();
@@ -69,20 +74,38 @@ private void register(TruffleString tstring, RubyEncoding encoding) {
6974
}
7075
}
7176

72-
public TruffleString getTString(TruffleString string, RubyEncoding encoding) {
73-
return getTString(TStringUtils.getBytesOrCopy(string, encoding), encoding);
77+
@TruffleBoundary
78+
public TruffleString getTString(TruffleString string, RubyEncoding rubyEncoding) {
79+
assert rubyEncoding != null;
80+
81+
var byteArray = string.getInternalByteArrayUncached(rubyEncoding.tencoding);
82+
final TBytesKey key = new TBytesKey(byteArray, rubyEncoding);
83+
84+
return getTString(key, TStringUtils.hasImmutableInternalByteArray(string));
85+
}
86+
87+
@TruffleBoundary
88+
public TruffleString getTString(InternalByteArray byteArray, boolean isImmutable, RubyEncoding rubyEncoding) {
89+
assert rubyEncoding != null;
90+
91+
return getTString(new TBytesKey(byteArray, rubyEncoding), isImmutable);
7492
}
7593

7694
@TruffleBoundary
7795
public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
7896
assert rubyEncoding != null;
7997

80-
final TBytesKey key = new TBytesKey(bytes, rubyEncoding);
98+
return getTString(new TBytesKey(bytes, rubyEncoding), true);
99+
}
100+
101+
@TruffleBoundary
102+
private TruffleString getTString(TBytesKey lookupKey, boolean isLookupKeyImmutable) {
103+
final TruffleString tstring = bytesToTString.get(lookupKey);
104+
var rubyEncoding = lookupKey.getMatchedEncoding();
81105

82-
final TruffleString tstring = bytesToTString.get(key);
83106
if (tstring != null) {
84107
++tstringsReusedCount;
85-
tstringBytesSaved += tstring.byteLength(rubyEncoding.tencoding);
108+
tstringBytesSaved += tstring.byteLength(lookupKey.getMatchedEncoding().tencoding);
86109

87110
return tstring;
88111
}
@@ -92,7 +115,7 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
92115
// reference equality optimizations. So, do another search but with a marker encoding. The only guarantee
93116
// we can make about the resulting TruffleString is that it would have the same logical byte[], but that's good enough
94117
// for our purposes.
95-
TBytesKey keyNoEncoding = new TBytesKey(bytes, null);
118+
TBytesKey keyNoEncoding = lookupKey.withNewEncoding(null);
96119
final TruffleString tstringWithSameBytesButDifferentEncoding = bytesToTString.get(keyNoEncoding);
97120

98121
final TruffleString newTString;
@@ -104,12 +127,11 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
104127
++byteArrayReusedCount;
105128
tstringBytesSaved += newTString.byteLength(rubyEncoding.tencoding);
106129
} else {
107-
newTString = TStringUtils.fromByteArray(bytes, rubyEncoding);
130+
newTString = lookupKey.toTruffleString();
108131
}
109132

110133
// Use the new TruffleString bytes in the cache, so we do not keep bytes alive unnecessarily.
111-
final TBytesKey newKey = new TBytesKey(TStringUtils.getBytesOrCopy(newTString, rubyEncoding), rubyEncoding);
112-
return bytesToTString.addInCacheIfAbsent(newKey, newTString);
134+
return bytesToTString.addInCacheIfAbsent(lookupKey.makeCacheable(isLookupKeyImmutable), newTString);
113135
}
114136

115137
public boolean contains(TruffleString string, RubyEncoding encoding) {

0 commit comments

Comments
 (0)