Merge tag 'crc-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull CRC cleanups from Eric Biggers:
 "Simplify the kconfig options for controlling which CRC implementations
  are built into the kernel, as was requested by Linus.

  This means making the option to disable the arch code visible only
  when CONFIG_EXPERT=y, and standardizing on a single generic
  implementation of CRC32"

* tag 'crc-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  lib/crc32: remove other generic implementations
  lib/crc: simplify the kconfig options for CRC implementations
mariux64 · Jan 29, 2025 · fed3819 · fed3819
2 parents af13ff1 + 5e3c1c4
commit fed3819
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 462 deletions.
diff --git a/lib/Kconfig b/lib/Kconfig
@@ -164,34 +164,9 @@ config CRC_T10DIF
 config ARCH_HAS_CRC_T10DIF
 	bool
 
-choice
-	prompt "CRC-T10DIF implementation"
-	depends on CRC_T10DIF
-	default CRC_T10DIF_IMPL_ARCH if ARCH_HAS_CRC_T10DIF
-	default CRC_T10DIF_IMPL_GENERIC if !ARCH_HAS_CRC_T10DIF
-	help
-	  This option allows you to override the default choice of CRC-T10DIF
-	  implementation.
-
-config CRC_T10DIF_IMPL_ARCH
-	bool "Architecture-optimized" if ARCH_HAS_CRC_T10DIF
-	help
-	  Use the optimized implementation of CRC-T10DIF for the selected
-	  architecture.  It is recommended to keep this enabled, as it can
-	  greatly improve CRC-T10DIF performance.
-
-config CRC_T10DIF_IMPL_GENERIC
-	bool "Generic implementation"
-	help
-	  Use the generic table-based implementation of CRC-T10DIF.  Selecting
-	  this will reduce code size slightly but can greatly reduce CRC-T10DIF
-	  performance.
-
-endchoice
-
 config CRC_T10DIF_ARCH
 	tristate
-	default CRC_T10DIF if CRC_T10DIF_IMPL_ARCH
+	default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS
 
 config CRC64_ROCKSOFT
 	tristate "CRC calculation for the Rocksoft model CRC64"
@@ -223,87 +198,9 @@ config CRC32
 config ARCH_HAS_CRC32
 	bool
 
-choice
-	prompt "CRC32 implementation"
-	depends on CRC32
-	default CRC32_IMPL_ARCH_PLUS_SLICEBY8 if ARCH_HAS_CRC32
-	default CRC32_IMPL_SLICEBY8 if !ARCH_HAS_CRC32
-	help
-	  This option allows you to override the default choice of CRC32
-	  implementation.  Choose the default unless you know that you need one
-	  of the others.
-
-config CRC32_IMPL_ARCH_PLUS_SLICEBY8
-	bool "Arch-optimized, with fallback to slice-by-8" if ARCH_HAS_CRC32
-	help
-	  Use architecture-optimized implementation of CRC32.  Fall back to
-	  slice-by-8 in cases where the arch-optimized implementation cannot be
-	  used, e.g. if the CPU lacks support for the needed instructions.
-
-	  This is the default when an arch-optimized implementation exists.
-
-config CRC32_IMPL_ARCH_PLUS_SLICEBY1
-	bool "Arch-optimized, with fallback to slice-by-1" if ARCH_HAS_CRC32
-	help
-	  Use architecture-optimized implementation of CRC32, but fall back to
-	  slice-by-1 instead of slice-by-8 in order to reduce the binary size.
-
-config CRC32_IMPL_SLICEBY8
-	bool "Slice by 8 bytes"
-	help
-	  Calculate checksum 8 bytes at a time with a clever slicing algorithm.
-	  This is much slower than the architecture-optimized implementation of
-	  CRC32 (if the selected arch has one), but it is portable and is the
-	  fastest implementation when no arch-optimized implementation is
-	  available.  It uses an 8KiB lookup table.  Most modern processors have
-	  enough cache to hold this table without thrashing the cache.
-
-config CRC32_IMPL_SLICEBY4
-	bool "Slice by 4 bytes"
-	help
-	  Calculate checksum 4 bytes at a time with a clever slicing algorithm.
-	  This is a bit slower than slice by 8, but has a smaller 4KiB lookup
-	  table.
-
-	  Only choose this option if you know what you are doing.
-
-config CRC32_IMPL_SLICEBY1
-	bool "Slice by 1 byte (Sarwate's algorithm)"
-	help
-	  Calculate checksum a byte at a time using Sarwate's algorithm.  This
-	  is not particularly fast, but has a small 1KiB lookup table.
-
-	  Only choose this option if you know what you are doing.
-
-config CRC32_IMPL_BIT
-	bool "Classic Algorithm (one bit at a time)"
-	help
-	  Calculate checksum one bit at a time.  This is VERY slow, but has
-	  no lookup table.  This is provided as a debugging option.
-
-	  Only choose this option if you are debugging crc32.
-
-endchoice
-
 config CRC32_ARCH
 	tristate
-	default CRC32 if CRC32_IMPL_ARCH_PLUS_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
-
-config CRC32_SLICEBY8
-	bool
-	default y if CRC32_IMPL_SLICEBY8 || CRC32_IMPL_ARCH_PLUS_SLICEBY8
-
-config CRC32_SLICEBY4
-	bool
-	default y if CRC32_IMPL_SLICEBY4
-
-config CRC32_SARWATE
-	bool
-	default y if CRC32_IMPL_SLICEBY1 || CRC32_IMPL_ARCH_PLUS_SLICEBY1
-
-config CRC32_BIT
-	bool
-	default y if CRC32_IMPL_BIT
+	default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS
 
 config CRC64
 	tristate "CRC64 functions"
@@ -343,6 +240,17 @@ config CRC8
 	  when they need to do cyclic redundancy check according CRC8
 	  algorithm. Module will be called crc8.
 
+config CRC_OPTIMIZATIONS
+	bool "Enable optimized CRC implementations" if EXPERT
+	default y
+	help
+	  Disabling this option reduces code size slightly by disabling the
+	  architecture-optimized implementations of any CRC variants that are
+	  enabled.  CRC checksumming performance may get much slower.
+
+	  Keep this enabled unless you're really trying to minimize the size of
+	  the kernel.
+
 config XXHASH
 	tristate
 

diff --git a/lib/crc32.c b/lib/crc32.c
@@ -30,178 +30,27 @@
 #include <linux/crc32poly.h>
 #include <linux/module.h>
 #include <linux/types.h>
-#include <linux/sched.h>
-#include "crc32defs.h"
-
-#if CRC_LE_BITS > 8
-# define tole(x) ((__force u32) cpu_to_le32(x))
-#else
-# define tole(x) (x)
-#endif
-
-#if CRC_BE_BITS > 8
-# define tobe(x) ((__force u32) cpu_to_be32(x))
-#else
-# define tobe(x) (x)
-#endif
 
 #include "crc32table.h"
 
 MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
 MODULE_DESCRIPTION("Various CRC32 calculations");
 MODULE_LICENSE("GPL");
 
-#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8
-
-/* implements slicing-by-4 or slicing-by-8 algorithm */
-static inline u32 __pure
-crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
-{
-# ifdef __LITTLE_ENDIAN
-#  define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
-#  define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \
-		   t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255])
-#  define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \
-		   t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255])
-# else
-#  define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
-#  define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \
-		   t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255])
-#  define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \
-		   t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255])
-# endif
-	const u32 *b;
-	size_t    rem_len;
-# ifdef CONFIG_X86
-	size_t i;
-# endif
-	const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
-# if CRC_LE_BITS != 32
-	const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
-# endif
-	u32 q;
-
-	/* Align it */
-	if (unlikely((long)buf & 3 && len)) {
-		do {
-			DO_CRC(*buf++);
-		} while ((--len) && ((long)buf)&3);
-	}
-
-# if CRC_LE_BITS == 32
-	rem_len = len & 3;
-	len = len >> 2;
-# else
-	rem_len = len & 7;
-	len = len >> 3;
-# endif
-
-	b = (const u32 *)buf;
-# ifdef CONFIG_X86
-	--b;
-	for (i = 0; i < len; i++) {
-# else
-	for (--b; len; --len) {
-# endif
-		q = crc ^ *++b; /* use pre increment for speed */
-# if CRC_LE_BITS == 32
-		crc = DO_CRC4;
-# else
-		crc = DO_CRC8;
-		q = *++b;
-		crc ^= DO_CRC4;
-# endif
-	}
-	len = rem_len;
-	/* And the last few bytes */
-	if (len) {
-		u8 *p = (u8 *)(b + 1) - 1;
-# ifdef CONFIG_X86
-		for (i = 0; i < len; i++)
-			DO_CRC(*++p); /* use pre increment for speed */
-# else
-		do {
-			DO_CRC(*++p); /* use pre increment for speed */
-		} while (--len);
-# endif
-	}
-	return crc;
-#undef DO_CRC
-#undef DO_CRC4
-#undef DO_CRC8
-}
-#endif
-
-
-/**
- * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
- *			CRC32/CRC32C
- * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for other
- *	 uses, or the previous crc32/crc32c value if computing incrementally.
- * @p: pointer to buffer over which CRC32/CRC32C is run
- * @len: length of buffer @p
- * @tab: little-endian Ethernet table
- * @polynomial: CRC32/CRC32c LE polynomial
- */
-static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
-					  size_t len, const u32 (*tab)[256],
-					  u32 polynomial)
+u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
 {
-#if CRC_LE_BITS == 1
-	int i;
-	while (len--) {
-		crc ^= *p++;
-		for (i = 0; i < 8; i++)
-			crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
-	}
-# elif CRC_LE_BITS == 2
-	while (len--) {
-		crc ^= *p++;
-		crc = (crc >> 2) ^ tab[0][crc & 3];
-		crc = (crc >> 2) ^ tab[0][crc & 3];
-		crc = (crc >> 2) ^ tab[0][crc & 3];
-		crc = (crc >> 2) ^ tab[0][crc & 3];
-	}
-# elif CRC_LE_BITS == 4
-	while (len--) {
-		crc ^= *p++;
-		crc = (crc >> 4) ^ tab[0][crc & 15];
-		crc = (crc >> 4) ^ tab[0][crc & 15];
-	}
-# elif CRC_LE_BITS == 8
-	/* aka Sarwate algorithm */
-	while (len--) {
-		crc ^= *p++;
-		crc = (crc >> 8) ^ tab[0][crc & 255];
-	}
-# else
-	crc = (__force u32) __cpu_to_le32(crc);
-	crc = crc32_body(crc, p, len, tab);
-	crc = __le32_to_cpu((__force __le32)crc);
-#endif
+	while (len--)
+		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
 	return crc;
 }
+EXPORT_SYMBOL(crc32_le_base);
 
-#if CRC_LE_BITS == 1
-u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
-{
-	return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
-}
-u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
-{
-	return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
-}
-#else
-u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len)
-{
-	return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE);
-}
 u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len)
 {
-	return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
+	while (len--)
+		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
+	return crc;
 }
-#endif
-EXPORT_SYMBOL(crc32_le_base);
 EXPORT_SYMBOL(crc32c_le_base);
 
 /*
@@ -277,64 +126,10 @@ u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len)
 EXPORT_SYMBOL(crc32_le_shift);
 EXPORT_SYMBOL(__crc32c_le_shift);
 
-/**
- * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
- * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
- *	other uses, or the previous crc32 value if computing incrementally.
- * @p: pointer to buffer over which CRC32 is run
- * @len: length of buffer @p
- * @tab: big-endian Ethernet table
- * @polynomial: CRC32 BE polynomial
- */
-static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
-					  size_t len, const u32 (*tab)[256],
-					  u32 polynomial)
-{
-#if CRC_BE_BITS == 1
-	int i;
-	while (len--) {
-		crc ^= *p++ << 24;
-		for (i = 0; i < 8; i++)
-			crc =
-			    (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
-					  0);
-	}
-# elif CRC_BE_BITS == 2
-	while (len--) {
-		crc ^= *p++ << 24;
-		crc = (crc << 2) ^ tab[0][crc >> 30];
-		crc = (crc << 2) ^ tab[0][crc >> 30];
-		crc = (crc << 2) ^ tab[0][crc >> 30];
-		crc = (crc << 2) ^ tab[0][crc >> 30];
-	}
-# elif CRC_BE_BITS == 4
-	while (len--) {
-		crc ^= *p++ << 24;
-		crc = (crc << 4) ^ tab[0][crc >> 28];
-		crc = (crc << 4) ^ tab[0][crc >> 28];
-	}
-# elif CRC_BE_BITS == 8
-	while (len--) {
-		crc ^= *p++ << 24;
-		crc = (crc << 8) ^ tab[0][crc >> 24];
-	}
-# else
-	crc = (__force u32) __cpu_to_be32(crc);
-	crc = crc32_body(crc, p, len, tab);
-	crc = __be32_to_cpu((__force __be32)crc);
-# endif
-	return crc;
-}
-
-#if CRC_BE_BITS == 1
-u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
-{
-	return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
-}
-#else
 u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
-	return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE);
+	while (len--)
+		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
+	return crc;
 }
-#endif
 EXPORT_SYMBOL(crc32_be_base);