Skip to content

Commit e64565f

Browse files
committed
Add CRCs for duplicate detection
1 parent b12a7f5 commit e64565f

File tree

13 files changed

+1979
-296
lines changed

13 files changed

+1979
-296
lines changed

Analyzer/AnalyzerTool.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@ namespace UnityDataTools.Analyzer;
88

99
public class AnalyzerTool
1010
{
11-
public int Analyze(string path, string databaseName, string searchPattern, bool extractReferences)
11+
public int Analyze(string path, string databaseName, string searchPattern, bool skipReferences)
1212
{
13-
using SQLiteWriter writer = new (databaseName, extractReferences);
13+
using SQLiteWriter writer = new (databaseName, skipReferences);
1414

1515
try
1616
{
@@ -46,7 +46,7 @@ public int Analyze(string path, string databaseName, string searchPattern, bool
4646

4747
Console.Write($"\rProcessing {i * 100 / files.Length}% ({i}/{files.Length}) {file}");
4848

49-
writer.WriteSerializedFile(serializedFileName, file);
49+
writer.WriteSerializedFile(serializedFileName, Path.GetDirectoryName(file) + Path.DirectorySeparatorChar);
5050
}
5151

5252
if (archive != null)
@@ -65,7 +65,7 @@ public int Analyze(string path, string databaseName, string searchPattern, bool
6565
{
6666
if (node.Flags.HasFlag(ArchiveNodeFlags.SerializedFile))
6767
{
68-
writer.WriteSerializedFile(node.Path, "/" + node.Path);
68+
writer.WriteSerializedFile(node.Path, "/");
6969
}
7070
}
7171
}

UnityFileSystem/TypeTreeReaders/PPtrReader.cs renamed to Analyzer/PPtrAndCrcProcessor.cs

Lines changed: 105 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,87 @@
11
using System;
2-
using System.Collections;
32
using System.Collections.Generic;
4-
using System.Linq;
53
using System.Text;
4+
using UnityDataTools.FileSystem;
5+
using Force.Crc32;
66

7-
namespace UnityDataTools.FileSystem.TypeTreeReaders;
7+
namespace UnityDataTools.Analyzer;
88

99
// This class extracts all the PPtrs in a serialized object and computes a CRC32 of the object's data. It executes a callback whenever a PPtr is found.
1010
// It provides a string representing the property path of the property (e.g. "m_MyObject.m_MyArray[2].m_PPtrProperty").
11-
public class PPtrReader
11+
public class PPtrAndCrcProcessor : IDisposable
1212
{
13-
public delegate void CallbackDelegate(long objectId, int fileId, long pathId, string propertyPath, string propertyType);
13+
public delegate int CallbackDelegate(long objectId, int fileId, long pathId, string propertyPath, string propertyType);
1414

1515
private SerializedFile m_SerializedFile;
1616
private UnityFileReader m_Reader;
1717
private long m_Offset;
1818
private long m_ObjectId;
19+
private uint m_Crc32;
20+
private string m_Folder;
1921
private StringBuilder m_StringBuilder = new();
22+
private byte[] m_pptrBytes = new byte[4];
2023

2124
private CallbackDelegate m_Callback;
2225

23-
public PPtrReader(SerializedFile serializedFile, UnityFileReader reader,
26+
private Dictionary<string, UnityFileReader> m_resourceReaders = new();
27+
28+
/// <summary>
/// Creates a processor bound to one serialized file.
/// </summary>
/// <param name="serializedFile">Serialized file whose objects will be processed.</param>
/// <param name="reader">Reader used to read the object data and to compute its CRC.</param>
/// <param name="folder">
/// Prefix prepended as-is to a resource file name when a resource reader is opened
/// (see GetResourceReader); callers are expected to pass it with a trailing separator.
/// </param>
/// <param name="callback">
/// Invoked for every PPtr whose fileId or pathId is non-zero; the int it returns is
/// folded into the object's CRC.
/// </param>
public PPtrAndCrcProcessor(SerializedFile serializedFile, UnityFileReader reader, string folder,
    CallbackDelegate callback)
{
    (m_SerializedFile, m_Reader, m_Folder, m_Callback) = (serializedFile, reader, folder, callback);
}
36+
37+
/// <summary>
/// Disposes every resource-file reader opened by this instance and empties the reader cache.
/// </summary>
public void Dispose()
{
    foreach (var entry in m_resourceReaders)
    {
        entry.Value.Dispose();
    }

    m_resourceReaders.Clear();
}
3046

31-
public void Process(long objectId, long offset, TypeTreeNode node)
47+
/// <summary>
/// Returns the reader for a resource file referenced by a serialized object, opening and
/// caching it the first time the file is requested.
/// </summary>
/// <param name="filename">
/// Resource path as stored in the serialized data. Only the part after the last '/' is
/// used; it is appended to the folder given to the constructor.
/// </param>
/// <returns>The cached reader; it stays owned by this instance and is released by Dispose.</returns>
private UnityFileReader GetResourceReader(string filename)
{
    // Keep only the file name. The test is '>= 0' (not '> 0') so that a path whose only
    // slash is the leading one is stripped as well; otherwise m_Folder + "/name" would
    // contain a doubled separator and be cached under a different key.
    var slashPos = filename.LastIndexOf('/');
    if (slashPos >= 0)
    {
        filename = filename.Substring(slashPos + 1);
    }

    if (!m_resourceReaders.TryGetValue(filename, out var reader))
    {
        // NOTE(review): the second argument is presumably the read buffer size (4 MiB) -
        // confirm against UnityFileReader.
        reader = new UnityFileReader(m_Folder + filename, 4 * 1024 * 1024);
        m_resourceReaders[filename] = reader;
    }

    return reader;
}
63+
64+
/// <summary>
/// Walks the top-level properties of one serialized object, reporting its PPtrs through
/// the callback and accumulating a CRC32 along the way.
/// </summary>
/// <param name="objectId">Id passed to the callback as the owner of each PPtr found.</param>
/// <param name="offset">Offset in the reader at which the object's data starts.</param>
/// <param name="node">Root type tree node of the object; each of its children is processed in order.</param>
/// <returns>The CRC32 accumulated while processing the object (starts from 0 for every call).</returns>
public uint Process(long objectId, long offset, TypeTreeNode node)
{
    // Reset the per-object state.
    m_ObjectId = objectId;
    m_Offset = offset;
    m_Crc32 = 0;

    foreach (var property in node.Children)
    {
        // The property path restarts at each top-level property.
        m_StringBuilder.Clear().Append(property.Name);
        ProcessNode(property);
    }

    return m_Crc32;
}
4379

4480
private void ProcessNode(TypeTreeNode node)
4581
{
4682
if (node.IsBasicType)
4783
{
84+
m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Size, m_Crc32);
4885
m_Offset += node.Size;
4986
}
5087
else if (node.IsArray)
@@ -63,9 +100,57 @@ private void ProcessNode(TypeTreeNode node)
63100

64101
ExtractPPtr(referencedType);
65102
}
103+
else if (node.Type == "StreamingInfo")
104+
{
105+
if (node.Children.Count != 3)
106+
throw new Exception("Invalid StreamingInfo");
107+
108+
var offset = node.Children[0].Size == 4 ? m_Reader.ReadInt32(m_Offset) : m_Reader.ReadInt64(m_Offset);
109+
m_Offset += node.Children[0].Size;
110+
111+
var size = m_Reader.ReadInt32(m_Offset);
112+
m_Offset += 4;
113+
114+
var stringSize = m_Reader.ReadInt32(m_Offset);
115+
var filename = m_Reader.ReadString(m_Offset + 4, stringSize);
116+
m_Offset += stringSize + 4;
117+
m_Offset = (m_Offset + 3) & ~(3);
118+
119+
if (size > 0)
120+
{
121+
var resourceFile = GetResourceReader(filename);
122+
123+
m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32);
124+
}
125+
}
126+
else if (node.Type == "StreamedResource")
127+
{
128+
if (node.Children.Count != 3)
129+
throw new Exception("Invalid StreamedResource");
130+
131+
var stringSize = m_Reader.ReadInt32(m_Offset);
132+
var filename = m_Reader.ReadString(m_Offset + 4, stringSize);
133+
m_Offset += stringSize + 4;
134+
m_Offset = (m_Offset + 3) & ~(3);
135+
136+
var offset = m_Reader.ReadInt64(m_Offset);
137+
m_Offset += 8;
138+
139+
var size = (int)m_Reader.ReadInt64(m_Offset);
140+
m_Offset += 8;
141+
142+
if (size > 0)
143+
{
144+
var resourceFile = GetResourceReader(filename);
145+
146+
m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32);
147+
}
148+
}
66149
else if (node.CSharpType == typeof(string))
67150
{
151+
var prevOffset = m_Offset;
68152
m_Offset += m_Reader.ReadInt32(m_Offset) + 4;
153+
m_Crc32 = m_Reader.ComputeCRC(prevOffset, (int)(m_Offset - prevOffset), m_Crc32);
69154
}
70155
else if (node.IsManagedReferenceRegistry)
71156
{
@@ -99,10 +184,12 @@ private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry = f
99184
if (dataNode.IsBasicType)
100185
{
101186
var arraySize = m_Reader.ReadInt32(m_Offset);
187+
m_Crc32 = m_Reader.ComputeCRC(m_Offset, dataNode.Size * arraySize + 4, m_Crc32);
102188
m_Offset += dataNode.Size * arraySize + 4;
103189
}
104190
else
105191
{
192+
m_Crc32 = m_Reader.ComputeCRC(m_Offset, 4, m_Crc32);
106193
var arraySize = m_Reader.ReadInt32(m_Offset);
107194
m_Offset += 4;
108195

@@ -127,6 +214,7 @@ private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry = f
127214

128215
// First child is rid.
129216
long rid = m_Reader.ReadInt64(m_Offset);
217+
m_Crc32 = m_Reader.ComputeCRC(m_Offset, 8, m_Crc32);
130218
m_Offset += 8;
131219

132220
ProcessManagedReferenceData(dataNode.Children[1], dataNode.Children[2], rid);
@@ -142,6 +230,7 @@ private void ProcessManagedReferenceRegistry(TypeTreeNode node)
142230

143231
// First child is version number.
144232
var version = m_Reader.ReadInt32(m_Offset);
233+
m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Children[0].Size, m_Crc32);
145234
m_Offset += node.Children[0].Size;
146235

147236
if (version == 1)
@@ -187,16 +276,19 @@ bool ProcessManagedReferenceData(TypeTreeNode refTypeNode, TypeTreeNode referenc
187276
throw new Exception("Invalid ReferencedManagedType");
188277

189278
// Each of the three strings below is stored as a 4-byte length followed by the bytes,
// then padded to a 4-byte boundary. The CRC covers the length prefix plus the string
// bytes, so the size passed to ComputeCRC is a byte count (stringSize + 4), not an
// absolute end offset.

// Class name.
var stringSize = m_Reader.ReadInt32(m_Offset);
m_Crc32 = m_Reader.ComputeCRC(m_Offset, stringSize + 4, m_Crc32);
var className = m_Reader.ReadString(m_Offset + 4, stringSize);
m_Offset += stringSize + 4;
m_Offset = (m_Offset + 3) & ~(3);

// Namespace name.
stringSize = m_Reader.ReadInt32(m_Offset);
m_Crc32 = m_Reader.ComputeCRC(m_Offset, stringSize + 4, m_Crc32);
var namespaceName = m_Reader.ReadString(m_Offset + 4, stringSize);
m_Offset += stringSize + 4;
m_Offset = (m_Offset + 3) & ~(3);

// Assembly name.
stringSize = m_Reader.ReadInt32(m_Offset);
m_Crc32 = m_Reader.ComputeCRC(m_Offset, stringSize + 4, m_Crc32);
var assemblyName = m_Reader.ReadString(m_Offset + 4, stringSize);
m_Offset += stringSize + 4;
m_Offset = (m_Offset + 3) & ~(3);
@@ -229,7 +321,12 @@ private void ExtractPPtr(string referencedType)
229321

230322
if (fileId != 0 || pathId != 0)
231323
{
232-
m_Callback(m_ObjectId, fileId, pathId, m_StringBuilder.ToString(), referencedType);
324+
var refId = m_Callback(m_ObjectId, fileId, pathId, m_StringBuilder.ToString(), referencedType);
325+
m_pptrBytes[0] = (byte)(refId >> 24);
326+
m_pptrBytes[1] = (byte)(refId >> 16);
327+
m_pptrBytes[2] = (byte)(refId >> 8);
328+
m_pptrBytes[3] = (byte)(refId);
329+
m_Crc32 = Crc32Algorithm.Append(m_Crc32, m_pptrBytes);
233330
}
234331
}
235332
}

Analyzer/README.md

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,13 @@ but it can easily be adapter to use another writer type. It takes four parameter
2222
recursively.
2323
* databaseName (string): database filename, it will be overwritten if it already exists.
2424
* searchPattern (string): file search pattern (e.g. \*.bundle).
25-
* extractReferences (bool): determines if the references (PPtrs) must be extracted and saved in
26-
the 'refs' table. Calling this method will create the SQLite output database and will recursively
27-
process the files matching the search pattern in the provided path. It will add a row in
28-
the 'objects' table for each serialized object. This table contain basic information such as the
29-
size and the name of the object (if it has one).
25+
* skipReferences (bool): determines if the CRC calculation and references (PPtrs) extraction must
26+
be skipped. This is faster, but the refs table will be empty and the duplicate assets won't be
27+
accurate.
28+
Calling this method will create the SQLite output database and will recursively
29+
process the files matching the search pattern in the provided path. It will add a row in
30+
the 'objects' table for each serialized object. This table contains basic information such as the
31+
size and the name of the object (if it has one).
3032

3133
# How to use the database
3234

@@ -62,10 +64,8 @@ case, Unity will include the asset in all the AssetBundles with a reference to i
6264
view_potential_duplicates provides the number of instances and the total size of the potentially
6365
duplicated assets. It also lists all the AssetBundles where the asset was found.
6466

65-
It is important to understand that there is a lot of false positives in that view. All the objects
66-
having an identical name, size and type are reported as potential duplicates. For example, if
67-
several animated characters have a bone GameObject named "Hand_L", they will all be reported as
68-
potential duplicates even if they are not part of the same object.
67+
If the skipReferences option is used, there will be a lot of false positives in that view. Otherwise,
68+
it should be very accurate because CRCs are used to determine if objects are identical.
6969

7070
## asset_view (AssetBundleProcessor)
7171

Analyzer/Resources/Init.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ CASE
4949
WHEN size >= 1024 AND size < (1024 * 1024) THEN printf('%!5.1f KB', size / 1024.0)
5050
WHEN size >= (1024 * 1024) AND size < (1024 * 1024 * 1024) THEN printf('%!5.1f MB', size / 1024.0 / 1024)
5151
WHEN size >= (1024 * 1024 * 1024) THEN printf('%!5.1f GB', size / 1024.0 / 1024 / 1024)
52-
END AS pretty_size
52+
END AS pretty_size, o.crc32
5353
FROM objects o
5454
INNER JOIN types t ON o.type = t.id
5555
INNER JOIN serialized_files sf ON o.serialized_file = sf.id

0 commit comments

Comments
 (0)