8585#endif
8686
8787
88+
8889/*-************************************
8990* Dependency
9091**************************************/
117118# endif /* _MSC_VER */
118119#endif /* LZ4_FORCE_INLINE */
119120
121+ /* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
122+ * Gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy,
123+ * together with a simple 8-byte copy loop as a fall-back path.
124+ * However, this optimization hurts the decompression speed by >30%,
125+ * because the execution does not go to the optimized loop
126+ * for typical compressible data, and all of the preamble checks
127+ * before going to the fall-back path become useless overhead.
128+ * This optimization happens only with the -O3 flag, and -O2 generates
129+ * a simple 8-byte copy loop.
130+ * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy
131+ * functions are annotated with __attribute__((optimize("O2"))),
132+ * and also LZ4_wildCopy is forcibly inlined, so that the O2 attribute
133+ * of LZ4_wildCopy does not affect the compression speed.
134+ */
135+ #if defined(__PPC64__ ) && defined(__LITTLE_ENDIAN__ ) && defined(__GNUC__ )
136+ # define LZ4_FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
137+ # define LZ4_FORCE_O2_INLINE_GCC_PPC64LE __attribute__((optimize("O2"))) LZ4_FORCE_INLINE
138+ #else
139+ # define LZ4_FORCE_O2_GCC_PPC64LE
140+ # define LZ4_FORCE_O2_INLINE_GCC_PPC64LE static
141+ #endif
142+
120143#if (defined(__GNUC__ ) && (__GNUC__ >= 3 )) || (defined(__INTEL_COMPILER ) && (__INTEL_COMPILER >= 800 )) || defined(__clang__ )
121144# define expect (expr ,value ) (__builtin_expect ((expr),(value)) )
122145#else
@@ -253,7 +276,8 @@ static void LZ4_copy8(void* dst, const void* src)
253276}
254277
255278/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
256- static void LZ4_wildCopy (void * dstPtr , const void * srcPtr , void * dstEnd )
279+ LZ4_FORCE_O2_INLINE_GCC_PPC64LE
280+ void LZ4_wildCopy (void * dstPtr , const void * srcPtr , void * dstEnd )
257281{
258282 BYTE * d = (BYTE * )dstPtr ;
259283 const BYTE * s = (const BYTE * )srcPtr ;
@@ -289,15 +313,24 @@ static const int LZ4_minLength = (MFLIMIT+1);
289313/*-************************************
290314* Error detection
291315**************************************/
316+ #if defined(LZ4_DEBUG ) && (LZ4_DEBUG >=1 )
317+ # include <assert.h>
318+ #else
319+ # ifndef assert
320+ # define assert (condition ) ((void)0)
321+ # endif
322+ #endif
323+
292324#define LZ4_STATIC_ASSERT (c ) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */
293325
294326#if defined(LZ4_DEBUG ) && (LZ4_DEBUG >=2 )
295327# include <stdio.h>
296- # define DEBUGLOG (l , ...) { \
297- if (l<=LZ4_DEBUG) { \
298- fprintf(stderr, __FILE__ ": "); \
299- fprintf(stderr, __VA_ARGS__); \
300- fprintf(stderr, " \n"); \
328+ static int g_debuglog_enable = 1 ;
329+ # define DEBUGLOG (l , ...) { \
330+ if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \
331+ fprintf(stderr, __FILE__ ": "); \
332+ fprintf(stderr, __VA_ARGS__); \
333+ fprintf(stderr, " \n"); \
301334 } }
302335#else
303336# define DEBUGLOG (l , ...) {} /* disabled */
@@ -307,7 +340,7 @@ static const int LZ4_minLength = (MFLIMIT+1);
307340/*-************************************
308341* Common functions
309342**************************************/
310- static unsigned LZ4_NbCommonBytes (register reg_t val )
343+ static unsigned LZ4_NbCommonBytes (reg_t val )
311344{
312345 if (LZ4_isLittleEndian ()) {
313346 if (sizeof (val )== 8 ) {
@@ -318,7 +351,14 @@ static unsigned LZ4_NbCommonBytes (register reg_t val)
318351# elif (defined(__clang__ ) || (defined(__GNUC__ ) && (__GNUC__ >=3 ))) && !defined(LZ4_FORCE_SW_BITCOUNT )
319352 return (__builtin_ctzll ((U64 )val ) >> 3 );
320353# else
321- static const int DeBruijnBytePos [64 ] = { 0 , 0 , 0 , 0 , 0 , 1 , 1 , 2 , 0 , 3 , 1 , 3 , 1 , 4 , 2 , 7 , 0 , 2 , 3 , 6 , 1 , 5 , 3 , 5 , 1 , 3 , 4 , 4 , 2 , 5 , 6 , 7 , 7 , 0 , 1 , 2 , 3 , 3 , 4 , 6 , 2 , 6 , 5 , 5 , 3 , 4 , 5 , 6 , 7 , 1 , 2 , 4 , 6 , 4 , 4 , 5 , 7 , 2 , 6 , 5 , 7 , 6 , 7 , 7 };
354+ static const int DeBruijnBytePos [64 ] = { 0 , 0 , 0 , 0 , 0 , 1 , 1 , 2 ,
355+ 0 , 3 , 1 , 3 , 1 , 4 , 2 , 7 ,
356+ 0 , 2 , 3 , 6 , 1 , 5 , 3 , 5 ,
357+ 1 , 3 , 4 , 4 , 2 , 5 , 6 , 7 ,
358+ 7 , 0 , 1 , 2 , 3 , 3 , 4 , 6 ,
359+ 2 , 6 , 5 , 5 , 3 , 4 , 5 , 6 ,
360+ 7 , 1 , 2 , 4 , 6 , 4 , 4 , 5 ,
361+ 7 , 2 , 6 , 5 , 7 , 6 , 7 , 7 };
322362 return DeBruijnBytePos [((U64 )((val & - (long long )val ) * 0x0218A392CDABBD3FULL )) >> 58 ];
323363# endif
324364 } else /* 32 bits */ {
@@ -329,21 +369,27 @@ static unsigned LZ4_NbCommonBytes (register reg_t val)
329369# elif (defined(__clang__ ) || (defined(__GNUC__ ) && (__GNUC__ >=3 ))) && !defined(LZ4_FORCE_SW_BITCOUNT )
330370 return (__builtin_ctz ((U32 )val ) >> 3 );
331371# else
332- static const int DeBruijnBytePos [32 ] = { 0 , 0 , 3 , 0 , 3 , 1 , 3 , 0 , 3 , 2 , 2 , 1 , 3 , 2 , 0 , 1 , 3 , 3 , 1 , 2 , 2 , 2 , 2 , 0 , 3 , 1 , 2 , 0 , 1 , 0 , 1 , 1 };
372+ static const int DeBruijnBytePos [32 ] = { 0 , 0 , 3 , 0 , 3 , 1 , 3 , 0 ,
373+ 3 , 2 , 2 , 1 , 3 , 2 , 0 , 1 ,
374+ 3 , 3 , 1 , 2 , 2 , 2 , 2 , 0 ,
375+ 3 , 1 , 2 , 0 , 1 , 0 , 1 , 1 };
333376 return DeBruijnBytePos [((U32 )((val & - (S32 )val ) * 0x077CB531U )) >> 27 ];
334377# endif
335378 }
336379 } else /* Big Endian CPU */ {
337- if (sizeof (val )== 8 ) {
380+ if (sizeof (val )== 8 ) { /* 64-bits */
338381# if defined(_MSC_VER ) && defined(_WIN64 ) && !defined(LZ4_FORCE_SW_BITCOUNT )
339382 unsigned long r = 0 ;
340383 _BitScanReverse64 ( & r , val );
341384 return (unsigned )(r >>3 );
342385# elif (defined(__clang__ ) || (defined(__GNUC__ ) && (__GNUC__ >=3 ))) && !defined(LZ4_FORCE_SW_BITCOUNT )
343386 return (__builtin_clzll ((U64 )val ) >> 3 );
344387# else
388+ static const U32 by32 = sizeof (val )* 4 ; /* 32 on 64 bits (goal), 16 on 32 bits.
389+ Just to avoid some static analyzer complaining about shift by 32 on 32-bits target.
390+ Note that this code path is never triggered in 32-bits mode. */
345391 unsigned r ;
346- if (!(val >>32 )) { r = 4 ; } else { r = 0 ; val >>=32 ; }
392+ if (!(val >>by32 )) { r = 4 ; } else { r = 0 ; val >>=by32 ; }
347393 if (!(val >>16 )) { r += 2 ; val >>=8 ; } else { val >>=24 ; }
348394 r += (!val );
349395 return r ;
@@ -366,11 +412,20 @@ static unsigned LZ4_NbCommonBytes (register reg_t val)
366412}
367413
368414#define STEPSIZE sizeof(reg_t)
369- static unsigned LZ4_count (const BYTE * pIn , const BYTE * pMatch , const BYTE * pInLimit )
415+ LZ4_FORCE_INLINE
416+ unsigned LZ4_count (const BYTE * pIn , const BYTE * pMatch , const BYTE * pInLimit )
370417{
371418 const BYTE * const pStart = pIn ;
372419
373- while (likely (pIn < pInLimit - (STEPSIZE - 1 ))) {
420+ if (likely (pIn < pInLimit - (STEPSIZE - 1 ))) {
421+ reg_t const diff = LZ4_read_ARCH (pMatch ) ^ LZ4_read_ARCH (pIn );
422+ if (!diff ) {
423+ pIn += STEPSIZE ; pMatch += STEPSIZE ;
424+ } else {
425+ return LZ4_NbCommonBytes (diff );
426+ } }
427+
428+ while (likely (pIn < pInLimit - (STEPSIZE - 1 ))) {
374429 reg_t const diff = LZ4_read_ARCH (pMatch ) ^ LZ4_read_ARCH (pIn );
375430 if (!diff ) { pIn += STEPSIZE ; pMatch += STEPSIZE ; continue ; }
376431 pIn += LZ4_NbCommonBytes (diff );
@@ -944,6 +999,7 @@ LZ4_stream_t* LZ4_createStream(void)
944999
/* Re-initialize a streaming compression context so it can be reused
 * for a new, independent stream. Zeroes the entire state structure. */
void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
{
    DEBUGLOG(4, "LZ4_resetStream");                   /* trace at debug level 4 */
    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));    /* full state wipe */
}
9491005
@@ -1109,6 +1165,7 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
11091165 * Note that it is important for performance that this function really get inlined,
11101166 * in order to remove useless branches during compilation optimization.
11111167 */
1168+ LZ4_FORCE_O2_GCC_PPC64LE
11121169LZ4_FORCE_INLINE int LZ4_decompress_generic (
11131170 const char * const src ,
11141171 char * const dst ,
@@ -1119,7 +1176,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
11191176 int partialDecoding , /* full, partial */
11201177 int targetOutputSize , /* only used if partialDecoding==partial */
11211178 int dict , /* noDict, withPrefix64k, usingExtDict */
1122- const BYTE * const lowPrefix , /* == dst when no prefix */
1179+ const BYTE * const lowPrefix , /* always <= dst, == dst when no prefix */
11231180 const BYTE * const dictStart , /* only if dict==usingExtDict */
11241181 const size_t dictSize /* note : = 0 if noDict */
11251182 )
@@ -1133,15 +1190,15 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
11331190 BYTE * oexit = op + targetOutputSize ;
11341191
11351192 const BYTE * const dictEnd = (const BYTE * )dictStart + dictSize ;
1136- const unsigned dec32table [ ] = {0 , 1 , 2 , 1 , 4 , 4 , 4 , 4 };
1137- const int dec64table [] = {0 , 0 , 0 , -1 , 0 , 1 , 2 , 3 };
1193+ const unsigned inc32table [ 8 ] = {0 , 1 , 2 , 1 , 0 , 4 , 4 , 4 };
1194+ const int dec64table [8 ] = {0 , 0 , 0 , -1 , -4 , 1 , 2 , 3 };
11381195
11391196 const int safeDecode = (endOnInput == endOnInputSize );
11401197 const int checkOffset = ((safeDecode ) && (dictSize < (int )(64 KB )));
11411198
11421199
11431200 /* Special cases */
1144- if ((partialDecoding ) && (oexit > oend - MFLIMIT )) oexit = oend - MFLIMIT ; /* targetOutputSize too high => decode everything */
1201+ if ((partialDecoding ) && (oexit > oend - MFLIMIT )) oexit = oend - MFLIMIT ; /* targetOutputSize too high => just decode everything */
11451202 if ((endOnInput ) && (unlikely (outputSize == 0 ))) return ((srcSize == 1 ) && (* ip == 0 )) ? 0 : -1 ; /* Empty output buffer */
11461203 if ((!endOnInput ) && (unlikely (outputSize == 0 ))) return (* ip == 0 ?1 :-1 );
11471204
@@ -1151,8 +1208,27 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
11511208 const BYTE * match ;
11521209 size_t offset ;
11531210
1154- /* get literal length */
11551211 unsigned const token = * ip ++ ;
1212+
1213+ /* shortcut for common case :
1214+ * in most circumstances, we expect to decode small matches (<= 18 bytes) separated by few literals (<= 14 bytes).
1215+ * this shortcut was tested on x86 and x64, where it improves decoding speed.
1216+ * it has not yet been benchmarked on ARM, Power, mips, etc. */
1217+ if (((ip + 14 /*maxLL*/ + 2 /*offset*/ <= iend )
1218+ & (op + 14 /*maxLL*/ + 18 /*maxML*/ <= oend ))
1219+ & ((token < (15 <<ML_BITS )) & ((token & ML_MASK ) != 15 )) ) {
1220+ size_t const ll = token >> ML_BITS ;
1221+ size_t const off = LZ4_readLE16 (ip + ll );
1222+ const BYTE * const matchPtr = op + ll - off ; /* pointer underflow risk ? */
1223+ if ((off >= 18 ) /* do not deal with overlapping matches */ & (matchPtr >= lowPrefix )) {
1224+ size_t const ml = (token & ML_MASK ) + MINMATCH ;
1225+ memcpy (op , ip , 16 ); op += ll ; ip += ll + 2 /*offset*/ ;
1226+ memcpy (op , matchPtr , 18 ); op += ml ;
1227+ continue ;
1228+ }
1229+ }
1230+
1231+ /* decode literal length */
11561232 if ((length = (token >>ML_BITS )) == RUN_MASK ) {
11571233 unsigned s ;
11581234 do {
@@ -1230,14 +1306,13 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
12301306 /* copy match within block */
12311307 cpy = op + length ;
12321308 if (unlikely (offset < 8 )) {
1233- const int dec64 = dec64table [offset ];
12341309 op [0 ] = match [0 ];
12351310 op [1 ] = match [1 ];
12361311 op [2 ] = match [2 ];
12371312 op [3 ] = match [3 ];
1238- match += dec32table [offset ];
1313+ match += inc32table [offset ];
12391314 memcpy (op + 4 , match , 4 );
1240- match -= dec64 ;
1315+ match -= dec64table [ offset ] ;
12411316 } else { LZ4_copy8 (op , match ); match += 8 ; }
12421317 op += 8 ;
12431318
@@ -1254,7 +1329,7 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
12541329 LZ4_copy8 (op , match );
12551330 if (length > 16 ) LZ4_wildCopy (op + 8 , match + 8 , cpy );
12561331 }
1257- op = cpy ; /* correction */
1332+ op = cpy ; /* correction */
12581333 }
12591334
12601335 /* end of decoding */
@@ -1269,16 +1344,19 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
12691344}
12701345
12711346
/* Decode one LZ4 block from `source` (exactly `compressedSize` bytes) into
 * `dest`, never writing past `maxDecompressedSize` bytes.
 * Thin wrapper selecting the size-checked decoder path:
 * endOnInputSize + full decoding, no dictionary; lowPrefix == dest marks
 * "no prefix" for the generic decoder.
 * Returns the decoded size, or a negative value on malformed input
 * (error paths visible in LZ4_decompress_generic return -1-style codes). */
LZ4_FORCE_O2_GCC_PPC64LE   /* gcc/ppc64le: force -O2 to avoid a slow -O3 wildCopy loop (see macro note) */
int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
}
12761352
/* Like LZ4_decompress_safe(), but stops once about `targetOutputSize` bytes
 * have been produced (partial decoding mode of the generic decoder).
 * Note: the generic decoder clamps targetOutputSize to oend-MFLIMIT, so the
 * exact amount decoded may exceed the target slightly — see the
 * `oexit` handling in LZ4_decompress_generic. */
LZ4_FORCE_O2_GCC_PPC64LE   /* gcc/ppc64le: force -O2 to avoid a slow -O3 wildCopy loop (see macro note) */
int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
{
    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
}
12811358
1359+ LZ4_FORCE_O2_GCC_PPC64LE
12821360int LZ4_decompress_fast (const char * source , char * dest , int originalSize )
12831361{
12841362 return LZ4_decompress_generic (source , dest , 0 , originalSize , endOnOutputSize , full , 0 , withPrefix64k , (BYTE * )(dest - 64 KB ), NULL , 64 KB );
@@ -1324,6 +1402,7 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
13241402 If it's not possible, save the relevant part of decoded data into a safe buffer,
13251403 and indicate where it stands using LZ4_setStreamDecode()
13261404*/
1405+ LZ4_FORCE_O2_GCC_PPC64LE
13271406int LZ4_decompress_safe_continue (LZ4_streamDecode_t * LZ4_streamDecode , const char * source , char * dest , int compressedSize , int maxOutputSize )
13281407{
13291408 LZ4_streamDecode_t_internal * lz4sd = & LZ4_streamDecode -> internal_donotuse ;
@@ -1350,6 +1429,7 @@ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const ch
13501429 return result ;
13511430}
13521431
1432+ LZ4_FORCE_O2_GCC_PPC64LE
13531433int LZ4_decompress_fast_continue (LZ4_streamDecode_t * LZ4_streamDecode , const char * source , char * dest , int originalSize )
13541434{
13551435 LZ4_streamDecode_t_internal * lz4sd = & LZ4_streamDecode -> internal_donotuse ;
@@ -1384,6 +1464,7 @@ Advanced decoding functions :
13841464 the dictionary must be explicitly provided within parameters
13851465*/
13861466
1467+ LZ4_FORCE_O2_GCC_PPC64LE
13871468LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic (const char * source , char * dest , int compressedSize , int maxOutputSize , int safe , const char * dictStart , int dictSize )
13881469{
13891470 if (dictSize == 0 )
@@ -1396,17 +1477,20 @@ LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char*
13961477 return LZ4_decompress_generic (source , dest , compressedSize , maxOutputSize , safe , full , 0 , usingExtDict , (BYTE * )dest , (const BYTE * )dictStart , dictSize );
13971478}
13981479
1480+ LZ4_FORCE_O2_GCC_PPC64LE
13991481int LZ4_decompress_safe_usingDict (const char * source , char * dest , int compressedSize , int maxOutputSize , const char * dictStart , int dictSize )
14001482{
14011483 return LZ4_decompress_usingDict_generic (source , dest , compressedSize , maxOutputSize , 1 , dictStart , dictSize );
14021484}
14031485
1486+ LZ4_FORCE_O2_GCC_PPC64LE
14041487int LZ4_decompress_fast_usingDict (const char * source , char * dest , int originalSize , const char * dictStart , int dictSize )
14051488{
14061489 return LZ4_decompress_usingDict_generic (source , dest , 0 , originalSize , 0 , dictStart , dictSize );
14071490}
14081491
14091492/* debug function */
1493+ LZ4_FORCE_O2_GCC_PPC64LE
14101494int LZ4_decompress_safe_forceExtDict (const char * source , char * dest , int compressedSize , int maxOutputSize , const char * dictStart , int dictSize )
14111495{
14121496 return LZ4_decompress_generic (source , dest , compressedSize , maxOutputSize , endOnInputSize , full , 0 , usingExtDict , (BYTE * )dest , (const BYTE * )dictStart , dictSize );
0 commit comments