404

[ Avaa Bypassed ]




Upload:

Command:

botdev@18.221.139.13: ~ $
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_HASH_H
#define _ASM_HASH_H

/*
 * HP-PA only implements integer multiply in the FPU.  However, for
 * integer multiplies by constant, it has a number of shift-and-add
 * (but no shift-and-subtract, sigh!) instructions that a compiler
 * can synthesize a code sequence with.
 *
 * Unfortunately, GCC isn't very efficient at using them.  For example
 * it uses three instructions for "x *= 21" when only two are needed.
 * But we can find a sequence manually.
 */

#define HAVE_ARCH__HASH_32 1

/*
 * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the
 * PA7100 pairing rules.  This is an in-order 2-way superscalar processor.
 * Only one instruction in a pair may be a shift (by more than 3 bits),
 * but other than that, simple ALU ops (including shift-and-add by up
 * to 3 bits) may be paired arbitrarily.
 *
 * PA8xxx processors also dual-issue ALU instructions, although with
 * fewer constraints, so this schedule is good for them, too.
 *
 * This 6-step sequence was found by Yevgen Voronenko's implementation
 * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html.
 */
static inline u32 __attribute_const__ __hash_32(u32 x)
{
	u32 t0, t1, t2;

	/*
	 * Stage 1: build three partial products of x (mod 2^32):
	 *   t0 = (x << 19) + x
	 *   t1 = (x << 9)  + t0
	 *   t2 = (x << 23) + t1
	 *
	 * Statements sharing a paragraph are intended to issue as a pair.
	 * A pair may hold only one shift by more than 3 bits, which is why
	 * the first shift stands alone.
	 */
	t0 = x << 19;

	t1 = x << 9;
	t0 += x;

	t2 = x << 23;
	t1 += t0;

	t2 += t1;

	/*
	 * Stage 2: combine the partial products into
	 *   (t1 << 11) + (t2 << 6) + (t0 << 3) - t2
	 * The (t2 << 6) term is formed as ((t2 << 3) << 3) by folding
	 * t2 << 3 into t0 before the final shift-and-add.
	 */
	t1 <<= 11;

	t0 += t2 << 3;
	t1 -= t2;

	return (t0 << 3) + t1;
}

#if BITS_PER_LONG == 64

#define HAVE_ARCH_HASH_64 1

/*
 * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky,
 * because available software for the purpose chokes on constants this
 * large.  (It's mostly designed for compiling FIR filter coefficients
 * into FPGAs.)
 *
 * However, Jason Thong pointed out a work-around.  The Hcub software
 * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple*
 * constant multiplication, and is good at finding shift-and-add chains
 * which share common terms.
 *
 * Looking at 0x61C8864680B583EB in binary:
 * 0110000111001000100001100100011010000000101101011000001111101011
 *  \______________/    \__________/       \_______/     \________/
 *   \____________________________/         \____________________/
 * you can see the non-zero bits are divided into several well-separated
 * blocks.  Hcub can find algorithms for those terms separately, which
 * can then be shifted and added together.
 *
 * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions
 * with 10, 9 or 8 adds, respectively, making a total of 11 for the
 * whole number.
 *
 * Using just two large blocks, 0xC3910C8D << 31 in the high bits,
 * and 0xB583EB in the low bits, produces as good an algorithm as any,
 * and with one more small shift than alternatives.
 *
 * The high bits are a larger number and more work to compute, as well
 * as needing one extra cycle to shift left 31 bits before the final
 * addition, so they are the critical path for scheduling.  The low bits
 * can fit into the scheduling slots left over.
 */


/*
 * This _ASSIGN(dst, src, ...) macro performs "dst = src", but prevents
 * GCC from inferring anything about the value assigned to "dst".
 *
 * Mechanically, it is an asm statement with an empty template: "dst" is
 * an "=r" (general register) output, and "src" is an input using the
 * matching constraint "0", meaning it must occupy the same location as
 * output operand 0.  The value therefore passes through unchanged, but
 * the compiler has to treat the result as opaque.
 *
 * This prevents it from mis-optimizing certain sequences.
 * In particular, gcc is annoyingly eager to combine consecutive shifts.
 * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into
 * "y += x << 19; z += x << 20;" even though the latter sequence needs
 * an additional instruction and temporary register.
 *
 * Because no actual assembly code is generated, this construct is
 * usefully portable across all GCC platforms, and so can be test-compiled
 * on non-PA systems.
 *
 * Any arguments after "src" are appended to the asm's input operand list.
 * In two places, additional unused input dependencies are added this way.
 * This forces GCC's scheduling so it does not rearrange instructions too
 * much.  Because the PA-8xxx is out of order, I'm not sure how much this
 * matters, but why make it more difficult for the processor than necessary?
 */
#define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__)

/*
 * Multiply by GOLDEN_RATIO_64 = 0x61C8864680B583EB using a heavily
 * optimized shift-and-add sequence.
 *
 * Without the final shift, the multiply proper is 19 instructions,
 * 10 cycles and uses only 4 temporaries.  Whew!
 *
 * You are not expected to understand this.
 */
static __always_inline u32 __attribute_const__
hash_64(u64 a, unsigned int bits)
{
	/*
	 * a:    64-bit value to hash; reused below as a scratch accumulator.
	 * bits: the result is the 64-bit product shifted right by
	 *       (64 - bits), then converted to the u32 return type.
	 */
	u64 b, c, d;

	/*
	 * Encourage GCC to move a dynamic shift to %sar early,
	 * thereby freeing up an additional temporary register.
	 *
	 * Either way, "bits" is replaced by the right-shift count
	 * (64 - bits) used in the return statement.  When "bits" is not a
	 * compile-time constant, the subtraction is routed through an empty
	 * asm whose output uses the "q" constraint.
	 */
	if (!__builtin_constant_p(bits))
		asm("" : "=q" (bits) : "0" (64 - bits));
	else
		bits = 64 - bits;

	/*
	 * Shift-and-add chain.  Each row below gives the values produced by
	 * the corresponding source line, written as multiples of the input x
	 * (the initial value of "a"); all arithmetic is modulo 2^64.
	 *
	 *   b = 5x                          c = x << 13
	 *   b = 21x                         d = x << 17
	 *   a = 23x                         c = 0x22000 * x
	 *   d = 23x << 10                   a = 23x << 19
	 *   d = 0xB7A400 * x                a = 23x << 23
	 *   c = 0x22015 * x                 a = 0xB800015 * x
	 *   d = 0xB583EB * x  (low block)   c = 0x1702203F * x
	 *   a = 0xC391020D * x              b = 21x << (7+31)
	 *   a = (0xC391020D << 31) * x      b = (21x << 38) + 0xB583EB * x
	 *   a = (0xC3910C8D << 31) * x + 0xB583EB * x
	 *
	 * The last row follows because 0xC391020D + (21 << 7) = 0xC3910C8D,
	 * so "a" ends up holding x * 0x61C8864680B583EB.
	 *
	 * The "X" operands passed to _ASSIGN are extra inputs that the empty
	 * asm never uses; they only add ordering dependencies on c and d.
	 */
	_ASSIGN(b, a*5);	c = a << 13;
	b = (b << 2) + a;	_ASSIGN(d, a << 17);
	a = b + (a << 1);	c += d;
	d = a << 10;		_ASSIGN(a, a << 19);
	d = a - d;		_ASSIGN(a, a << 4, "X" (d));
	c += b;			a += b;
	d -= c;			c += a << 1;
	a += c << 3;		_ASSIGN(b, b << (7+31), "X" (c), "X" (d));
	a <<= 31;		b += d;
	a += b;
	/* "bits" now holds 64 - (requested bits); see the top of the function. */
	return a >> bits;
}
#undef _ASSIGN	/* We're a widely-used header file, so don't litter! */

#endif /* BITS_PER_LONG == 64 */

#endif /* _ASM_HASH_H */

Filemanager

Name Type Size Permission Actions
Kbuild File 610 B 0644
agp.h File 596 B 0644
asm-offsets.h File 35 B 0644
asmregs.h File 3.04 KB 0644
assembly.h File 12.94 KB 0644
atomic.h File 8.24 KB 0644
barrier.h File 2.44 KB 0644
bitops.h File 5.89 KB 0644
bug.h File 2.35 KB 0644
bugs.h File 340 B 0644
cache.h File 1.59 KB 0644
cacheflush.h File 4.06 KB 0644
checksum.h File 5.48 KB 0644
cmpxchg.h File 3.62 KB 0644
compat.h File 6.52 KB 0644
compat_ucontext.h File 591 B 0644
delay.h File 533 B 0644
dma-mapping.h File 2.42 KB 0644
dma.h File 5.71 KB 0644
dwarf.h File 602 B 0644
eisa_bus.h File 702 B 0644
eisa_eeprom.h File 4.42 KB 0644
elf.h File 14.31 KB 0644
fb.h File 403 B 0644
fixmap.h File 1.15 KB 0644
floppy.h File 6.61 KB 0644
ftrace.h File 379 B 0644
futex.h File 2.66 KB 0644
grfioctl.h File 4.68 KB 0644
hardirq.h File 1.3 KB 0644
hardware.h File 4.09 KB 0644
hash.h File 5.07 KB 0644
hugetlb.h File 1.67 KB 0644
ide.h File 1.09 KB 0644
io.h File 8.14 KB 0644
irq.h File 1.23 KB 0644
irqflags.h File 1.02 KB 0644
kbdleds.h File 477 B 0644
kmap_types.h File 221 B 0644
ldcw.h File 2.15 KB 0644
led.h File 1.33 KB 0644
linkage.h File 759 B 0644
machdep.h File 349 B 0644
mckinley.h File 270 B 0644
mmu.h File 195 B 0644
mmu_context.h File 2.08 KB 0644
mmzone.h File 1.51 KB 0644
module.h File 527 B 0644
page.h File 5.42 KB 0644
parisc-device.h File 1.92 KB 0644
parport.h File 358 B 0644
pci.h File 6.57 KB 0644
pdc.h File 3.93 KB 0644
pdc_chassis.h File 15.06 KB 0644
pdcpat.h File 15 KB 0644
perf.h File 1.89 KB 0644
perf_event.h File 152 B 0644
pgalloc.h File 4.22 KB 0644
pgtable.h File 18.92 KB 0644
prefetch.h File 1.12 KB 0644
processor.h File 9.86 KB 0644
psw.h File 2.39 KB 0644
ptrace.h File 803 B 0644
ropes.h File 9.73 KB 0644
rt_sigframe.h File 745 B 0644
runway.h File 320 B 0644
sections.h File 283 B 0644
serial.h File 124 B 0644
shmparam.h File 263 B 0644
signal.h File 841 B 0644
smp.h File 1.25 KB 0644
socket.h File 311 B 0644
special_insns.h File 1015 B 0644
spinlock.h File 4.02 KB 0644
spinlock_types.h File 483 B 0644
string.h File 247 B 0644
superio.h File 3.25 KB 0644
switch_to.h File 332 B 0644
syscall.h File 1.4 KB 0644
termios.h File 1.72 KB 0644
thread_info.h File 3.15 KB 0644
timex.h File 372 B 0644
tlb.h File 672 B 0644
tlbflush.h File 2.63 KB 0644
topology.h File 900 B 0644
traps.h File 468 B 0644
uaccess.h File 6.55 KB 0644
ucontext.h File 327 B 0644
unaligned.h File 472 B 0644
unistd.h File 5.47 KB 0644
unwind.h File 2.56 KB 0644