I played a CTF over the weekend (and came across adwa's blog write-up of it). The challenge requires brute-forcing a 2^32 keyspace and verifying each candidate with a few function calls (an SM3 hash check, as the code below shows). According to adwa's blog, a Python solution takes about 300 hours and a concurrent Go solution about 8 minutes (my own M1 Pro needed 15 minutes).
With my own (more or less) maxed-out Go optimizations it finishes in under five minutes. The optimizations include, but are not limited to: suppressing the GC, preallocating buffers with make by hand, rewriting the hot path of the sm3 library ("you think you can out-write the library?" meme), and big/little-core tuning: the performance cores run the core brute-force computation while the efficiency cores handle task scheduling.
After the contest, a teammate from rec mentioned that CUDA could speed this up further (the guy really does share his tricks), but I have no NVIDIA GPU at hand.
So I finally tried Metal (GPU) programming, which needs less than 25 s. On the newest chips, with the core count adjusted, it can go faster still: I estimate an M4 Pro would need only about 10 s, and skipping the progress display would shave off a little more.
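Before the GPU version, here is a minimal standalone sketch of the CPU-side tuning mentioned above (disabling the GC and preallocating buffers by hand). It is illustrative only, not the actual code of my CPU solver, and the buffer names are made up for the example.

package main

import "runtime/debug"

func main() {
	// Turn the garbage collector off for the duration of the brute force:
	// the working set is small and fixed, so GC cycles are pure overhead.
	debug.SetGCPercent(-1)

	// "Manual make": allocate every buffer once up front and reuse it in the
	// hot loop instead of allocating per candidate.
	candidate := make([]byte, 32) // hypothetical candidate buffer
	digest := make([]byte, 32)    // hypothetical digest buffer
	_, _ = candidate, digest      // stand-in for the real hot loop
}

The actual Metal-based solver follows.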

package main
/*
#cgo CFLAGS: -x objective-c -fobjc-arc
#cgo LDFLAGS: -framework Metal -framework Foundation -framework CoreGraphics
#import <Metal/Metal.h>
#import <Foundation/Foundation.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
// Metal device and resources
id<MTLDevice> device;
id<MTLCommandQueue> commandQueue;
id<MTLComputePipelineState> computePipelineState;
id<MTLBuffer> candidateBuffer;
id<MTLBuffer> resultBuffer;
id<MTLBuffer> targetBuffer;
id<MTLBuffer> foundBuffer;
// SM3 Metal shader source
const char* sm3MetalSource = R"(
#include <metal_stdlib>
using namespace metal;
// SM3 constants (initial vector)
constant uint32_t SM3_IV[8] = {
    0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600,
    0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e
};
// 32-bit rotate left (the shift amount is masked so n = 0 or n >= 32 stays well defined)
inline uint32_t rotateLeft(uint32_t x, uint32_t n) {
    n &= 31u;
    return n ? ((x << n) | (x >> (32u - n))) : x;
}
// SM3 boolean functions and permutations
inline uint32_t ff0(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (x & z) | (y & z); }
inline uint32_t gg0(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z) { return (x & y) | (~x & z); }
inline uint32_t p0(uint32_t x) { return x ^ rotateLeft(x, 9) ^ rotateLeft(x, 17); }
inline uint32_t p1(uint32_t x) { return x ^ rotateLeft(x, 15) ^ rotateLeft(x, 23); }
// map the low 2 bits of an index to one of the four candidate characters
inline uchar indexToChar(uint64_t index) {
    const uchar chars[4] = {'a', 'b', 'c', 'd'};
    return chars[index & 3];
}
// SM3 core computation for one 32-byte message, using thread-local memory
void sm3_hash_local(thread const uchar* input, thread uchar* output) {
    uint32_t digest[8];
    for (int i = 0; i < 8; i++) {
        digest[i] = SM3_IV[i];
    }
    // prepare the message block
    uint32_t W[68];
    uint32_t W1[64];
    // pad the message
    uchar padded[64];
    for (int i = 0; i < 32; i++) {
        padded[i] = input[i];
    }
    padded[32] = 0x80;
    for (int i = 33; i < 62; i++) {
        padded[i] = 0;
    }
    padded[62] = 0x01;
    padded[63] = 0x00;
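    // Padding layout for the fixed 32-byte (256-bit) message:
    //   padded[0..31]  = input
    //   padded[32]     = 0x80
    //   padded[33..61] = 0x00
    //   padded[62..63] = 0x01 0x00  (bit length 256 = 0x0100, big-endian)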
    // message expansion
    for (int i = 0; i < 16; i++) {
        W[i] = ((uint32_t)padded[i*4] << 24) |
               ((uint32_t)padded[i*4+1] << 16) |
               ((uint32_t)padded[i*4+2] << 8) |
               ((uint32_t)padded[i*4+3]);
    }
    for (int i = 16; i < 68; i++) {
        W[i] = p1(W[i-16] ^ W[i-9] ^ rotateLeft(W[i-3], 15)) ^
               rotateLeft(W[i-13], 7) ^ W[i-6];
    }
    for (int i = 0; i < 64; i++) {
        W1[i] = W[i] ^ W[i+4];
    }
    // compression function
    uint32_t A = digest[0], B = digest[1], C = digest[2], D = digest[3];
    uint32_t E = digest[4], F = digest[5], G = digest[6], H = digest[7];
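    // 64 rounds in total: rounds 0-15 use constant 0x79cc4519 with ff0/gg0,
    // rounds 16-63 use 0x7a879d8a with ff1/gg1; the round constant is rotated
    // left by the round index (taken mod 32).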
    for (int i = 0; i < 16; i++) {
        uint32_t SS1 = rotateLeft(rotateLeft(A, 12) + E + rotateLeft(0x79cc4519, i), 7);
        uint32_t SS2 = SS1 ^ rotateLeft(A, 12);
        uint32_t TT1 = ff0(A, B, C) + D + SS2 + W1[i];
        uint32_t TT2 = gg0(E, F, G) + H + SS1 + W[i];
        D = C;
        C = rotateLeft(B, 9);
        B = A;
        A = TT1;
        H = G;
        G = rotateLeft(F, 19);
        F = E;
        E = p0(TT2);
    }
    for (int i = 16; i < 64; i++) {
        uint32_t SS1 = rotateLeft(rotateLeft(A, 12) + E + rotateLeft(0x7a879d8a, i), 7);
        uint32_t SS2 = SS1 ^ rotateLeft(A, 12);
        uint32_t TT1 = ff1(A, B, C) + D + SS2 + W1[i];
        uint32_t TT2 = gg1(E, F, G) + H + SS1 + W[i];
        D = C;
        C = rotateLeft(B, 9);
        B = A;
        A = TT1;
        H = G;
        G = rotateLeft(F, 19);
        F = E;
        E = p0(TT2);
    }
    // final hash value
    digest[0] ^= A; digest[1] ^= B; digest[2] ^= C; digest[3] ^= D;
    digest[4] ^= E; digest[5] ^= F; digest[6] ^= G; digest[7] ^= H;
    // output in big-endian byte order
    for (int i = 0; i < 8; i++) {
        output[i*4] = (digest[i] >> 24) & 0xff;
        output[i*4+1] = (digest[i] >> 16) & 0xff;
        output[i*4+2] = (digest[i] >> 8) & 0xff;
        output[i*4+3] = digest[i] & 0xff;
    }
}
// GPU kernel
kernel void sm3_search(
    device uchar* result [[buffer(0)]],      // output: the matching candidate
    constant uchar* target [[buffer(1)]],    // target hash
    device atomic_int* found [[buffer(2)]],  // found flag
    constant uint64_t* baseIndex [[buffer(3)]], // base index for this batch
    uint3 gid [[thread_position_in_grid]]    // thread position
) {
    // compute the global index
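    // The host dispatches a 1-D grid, so gid.y and gid.z are always 0 here and
    // globalId reduces to gid.x; the extra terms would only matter for a 3-D dispatch.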
    uint64_t globalId = gid.x + gid.y * 1024 + gid.z * 1024 * 1024;
    uint64_t candidateIndex = baseIndex[0] + globalId;
    // bail out if another thread has already found the answer
    if (atomic_load_explicit(found, memory_order_relaxed) != 0) {
        return;
    }
    // build the candidate in thread-local memory
    thread uchar candidate[32];
    // fixed prefix "adcddbbadcacabad"
    candidate[0] = 'a'; candidate[1] = 'd'; candidate[2] = 'c'; candidate[3] = 'd';
    candidate[4] = 'd'; candidate[5] = 'b'; candidate[6] = 'b'; candidate[7] = 'a';
    candidate[8] = 'd'; candidate[9] = 'c'; candidate[10] = 'a'; candidate[11] = 'c';
    candidate[12] = 'a'; candidate[13] = 'b'; candidate[14] = 'a'; candidate[15] = 'd';
    // derive the last 16 bytes from the index, 2 bits per character
    uint64_t idx = candidateIndex;
    for (int i = 0; i < 16; i++) {
        candidate[16 + i] = indexToChar(idx);
        idx >>= 2;
    }
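    // Each character consumes 2 bits of the index, so the 16 characters cover
    // exactly 4^16 = 2^32 candidates, i.e. the full search space.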
    // compute the hash into thread-local memory
    thread uchar hash[32];
    sm3_hash_local(candidate, hash);
    // compare against the target
    bool match = true;
    for (int i = 0; i < 32; i++) {
        if (hash[i] != target[i]) {
            match = false;
            break;
        }
    }
    if (match) {
        // found it!
        atomic_store_explicit(found, 1, memory_order_relaxed);
        // save the result to global memory
        for (int i = 0; i < 32; i++) {
            result[i] = candidate[i];
        }
    }
}
)";
// GPU information
typedef struct {
    int coreCount;
    int maxThreadsPerThreadgroup;
    int maxThreadgroupsPerMeshGrid;
    int registryID;
    char name[256];
} GPUInfo;
// use system_profiler to get the exact GPU core count
int getGPUCoresFromSystemProfiler() {
    FILE *fp;
    char buffer[128];
    int cores = 0;
    // run the system_profiler command
    fp = popen("system_profiler SPDisplaysDataType | awk '/Total Number of Cores:/{print $5}'", "r");
    if (fp == NULL) {
        printf("Failed to run system_profiler command\n");
        return 0;
    }
    // read its output
    if (fgets(buffer, sizeof(buffer), fp) != NULL) {
        cores = atoi(buffer);
    }
    pclose(fp);
    return cores;
}
GPUInfo getGPUInfo() {
    GPUInfo info = {0};
    if (device) {
        // GPU name
        strncpy(info.name, [[device name] UTF8String], 255);
        // use system_profiler for an accurate core count
        info.coreCount = getGPUCoresFromSystemProfiler();
        // fall back to a conservative default if detection fails
        if (info.coreCount == 0) {
            printf("Warning: Could not detect GPU cores, using default value\n");
            info.coreCount = 8; // conservative estimate
        }
        // registry ID
        info.registryID = (int)[device registryID];
    }
    return info;
}
// initialize Metal
int initMetal(GPUInfo* gpuInfo) {
    @autoreleasepool {
        NSError *error = nil;
        // enumerate all GPU devices
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        if (devices.count > 0) {
            printf("Found %lu GPU devices:\n", devices.count);
            for (int i = 0; i < devices.count; i++) {
                id<MTLDevice> dev = devices[i];
                printf("  %d: %s\n", i, [[dev name] UTF8String]);
            }
            // use the first device (usually the most capable one)
            device = devices[0];
        } else {
            // fall back to the default GPU device
            device = MTLCreateSystemDefaultDevice();
        }
        if (!device) {
            printf("Metal is not supported on this device\n");
            return -1;
        }
        // gather detailed GPU information
        *gpuInfo = getGPUInfo();
        printf("\n=== GPU Information ===\n");
        printf("GPU: %s\n", gpuInfo->name);
        printf("GPU Cores: %d\n", gpuInfo->coreCount);
        printf("Registry ID: %d\n", gpuInfo->registryID);
        // report GPU capabilities
        printf("\nGPU Capabilities:\n");
        printf("  Unified Memory: %s\n", [device hasUnifiedMemory] ? "YES" : "NO");
        printf("  Max Buffer Length: %.2f GB\n", (double)[device maxBufferLength] / (1024*1024*1024));
        printf("  Max Threads Per Threadgroup: %lu x %lu x %lu\n",
               [device maxThreadsPerThreadgroup].width,
               [device maxThreadsPerThreadgroup].height,
               [device maxThreadsPerThreadgroup].depth);
        if ([device respondsToSelector:@selector(recommendedMaxWorkingSetSize)]) {
            printf("  Recommended Max Working Set: %.2f GB\n",
                   (double)[device recommendedMaxWorkingSetSize] / (1024*1024*1024));
        }
        // GPU family support
        printf("\nGPU Family Support:\n");
        if ([device supportsFamily:MTLGPUFamilyApple8]) {
            printf("  Apple GPU Family 8 (M2)\n");
        } else if ([device supportsFamily:MTLGPUFamilyApple7]) {
            printf("  Apple GPU Family 7 (M1)\n");
        }
        // create the command queue
        commandQueue = [device newCommandQueue];
        if (!commandQueue) {
            printf("Failed to create command queue\n");
            return -1;
        }
        // compile the shader
        NSString *source = [NSString stringWithUTF8String:sm3MetalSource];
        MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
        options.fastMathEnabled = YES;
        id<MTLLibrary> library = [device newLibraryWithSource:source options:options error:&error];
        if (!library) {
            printf("Failed to compile shader: %s\n", [[error description] UTF8String]);
            return -1;
        }
        // look up the kernel function
        id<MTLFunction> kernelFunction = [library newFunctionWithName:@"sm3_search"];
        if (!kernelFunction) {
            printf("Failed to find kernel function\n");
            return -1;
        }
        // create the compute pipeline state
        computePipelineState = [device newComputePipelineStateWithFunction:kernelFunction error:&error];
        if (!computePipelineState) {
            printf("Failed to create pipeline state: %s\n", [[error description] UTF8String]);
            return -1;
        }
        // query the maximum threadgroup size
        gpuInfo->maxThreadsPerThreadgroup = (int)computePipelineState.maxTotalThreadsPerThreadgroup;
        printf("\nPipeline Info:\n");
        printf("  Max Threads Per Threadgroup: %d\n", gpuInfo->maxThreadsPerThreadgroup);
        printf("  Thread Execution Width: %lu\n", computePipelineState.threadExecutionWidth);
        // create the buffers
        resultBuffer = [device newBufferWithLength:32 options:MTLResourceStorageModeShared];
        targetBuffer = [device newBufferWithLength:32 options:MTLResourceStorageModeShared];
        foundBuffer = [device newBufferWithLength:sizeof(int) options:MTLResourceStorageModeShared];
        candidateBuffer = [device newBufferWithLength:sizeof(uint64_t) options:MTLResourceStorageModeShared];
        if (!resultBuffer || !targetBuffer || !foundBuffer || !candidateBuffer) {
            printf("Failed to create buffers\n");
            return -1;
        }
        return 0;
    }
}
// run one search batch on the GPU
int searchOnGPU(uint64_t startIndex, uint64_t count, const uint8_t* target, uint8_t* result, int maxThreadsPerThreadgroup) {
    @autoreleasepool {
        // set the target hash
        memcpy([targetBuffer contents], target, 32);
        // set the base index
        *(uint64_t*)[candidateBuffer contents] = startIndex;
        // reset the found flag
        *(int*)[foundBuffer contents] = 0;
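        // All four buffers were created with MTLResourceStorageModeShared, so
        // these CPU-side writes land directly in GPU-visible unified memory.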
        // create the command buffer
        id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
        if (!commandBuffer) {
            printf("Failed to create command buffer\n");
            return -1;
        }
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
        if (!encoder) {
            printf("Failed to create compute encoder\n");
            return -1;
        }
        [encoder setComputePipelineState:computePipelineState];
        [encoder setBuffer:resultBuffer offset:0 atIndex:0];
        [encoder setBuffer:targetBuffer offset:0 atIndex:1];
        [encoder setBuffer:foundBuffer offset:0 atIndex:2];
        [encoder setBuffer:candidateBuffer offset:0 atIndex:3];
        // choose the threadgroup size, adjusted to the GPU's limits
        NSUInteger threadsPerThreadgroup = MIN(maxThreadsPerThreadgroup, 256);
        if (threadsPerThreadgroup > computePipelineState.maxTotalThreadsPerThreadgroup) {
            threadsPerThreadgroup = computePipelineState.maxTotalThreadsPerThreadgroup;
        }
        NSUInteger threadgroupsPerGrid = (count + threadsPerThreadgroup - 1) / threadsPerThreadgroup;
        // cap the total number of threadgroups
        if (threadgroupsPerGrid > 65536) {
            threadgroupsPerGrid = 65536;
        }
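        // With the 4M batch cap and 256 threads per group this comes to at most
        // 16384 threadgroups, so the 65536 limit is never actually hit here.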
        MTLSize threadsPerThreadgroupSize = MTLSizeMake(threadsPerThreadgroup, 1, 1);
        MTLSize threadgroupsPerGridSize = MTLSizeMake(threadgroupsPerGrid, 1, 1);
        // dispatch the compute work
        [encoder dispatchThreadgroups:threadgroupsPerGridSize
                threadsPerThreadgroup:threadsPerThreadgroupSize];
        [encoder endEncoding];
        // commit and wait for completion
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
        // check the result
        if (*(int*)[foundBuffer contents] != 0) {
            memcpy(result, [resultBuffer contents], 32);
            return 1;
        }
        return 0;
    }
}
// release Metal resources
void cleanupMetal() {
    device = nil;
    commandQueue = nil;
    computePipelineState = nil;
    resultBuffer = nil;
    targetBuffer = nil;
    foundBuffer = nil;
    candidateBuffer = nil;
}
*/
import "C"
import (
	"context"
	"encoding/hex"
	"fmt"
	"log"
	"runtime"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"
	"github.com/schollz/progressbar/v3"
)
// GPU configuration (detected at runtime)
var (
	GPUCores                 int
	MaxThreadsPerThreadgroup int
	GPUBatchSize             int
)
var (
	tarHex   = "aab05fca300811223b3b957bfe33130770fb7a6b55b030a5809c559344f66f79"
	tarBytes []byte
)
var (
	globalProgress atomic.Int64
	foundFlag      atomic.Int32
	foundResult    [32]byte
	resultMutex    sync.Mutex
	// gpuMutex serializes calls into C.searchOnGPU: the Metal buffers it fills
	// (candidateBuffer, foundBuffer, resultBuffer) are shared globals, so two
	// schedulers must not set them up while another batch is still in flight.
	gpuMutex sync.Mutex
)
func init() {
	var err error
	tarBytes, err = hex.DecodeString(tarHex)
	if err != nil {
		log.Fatalf("无法解码目标哈希: %v", err)
	}
	// initialize Metal and query GPU information
	fmt.Println("Initializing Metal GPU...")
	var gpuInfo C.GPUInfo
	if ret := C.initMetal(&gpuInfo); ret != 0 {
		log.Fatalf("Metal 初始化失败")
	}
	// record the GPU parameters
	GPUCores = int(gpuInfo.coreCount)
	MaxThreadsPerThreadgroup = int(gpuInfo.maxThreadsPerThreadgroup)
	// compute a reasonable batch size
	// based on the GPU core count and the maximum threadgroup size
	GPUBatchSize = GPUCores * MaxThreadsPerThreadgroup * 16 // 16x oversubscription
	if GPUBatchSize > (1 << 22) {                           // cap at 4M
		GPUBatchSize = 1 << 22
	}
	fmt.Printf("\n=== GPU配置 ===\n")
	fmt.Printf("GPU核心数: %d\n", GPUCores)
	fmt.Printf("最大线程组大小: %d\n", MaxThreadsPerThreadgroup)
	fmt.Printf("批处理大小: %d (%.2fM)\n", GPUBatchSize, float64(GPUBatchSize)/(1024*1024))
	fmt.Println("\nMetal GPU 初始化成功!")
}
func main() {
	// use all CPU cores to coordinate GPU work
	runtime.GOMAXPROCS(runtime.NumCPU())
	totalOperations := int64(256 * (0xffffff + 1))
	bar := progressbar.NewOptions64(totalOperations,
		progressbar.OptionSetDescription(fmt.Sprintf("GPU-accelerated (%d-core GPU)...", GPUCores)),
		progressbar.OptionShowBytes(false),
		progressbar.OptionSetWidth(30),
		progressbar.OptionShowCount(),
		progressbar.OptionSetTheme(progressbar.Theme{
			Saucer: "=", SaucerHead: ">", SaucerPadding: " ",
			BarStart: "[", BarEnd: "]",
		}),
		progressbar.OptionThrottle(50*time.Millisecond),
	)
	// create the job queue
	jobs := make(chan uint64, 256)
	ctx, cancel := context.WithCancel(context.Background())
	wg := &sync.WaitGroup{}
	// progress updates
	progressDone := make(chan struct{})
	go progressUpdater(bar, progressDone)
	// start the GPU schedulers
	numSchedulers := 4 // 4 schedulers feed batches to the GPU
	for i := 0; i < numSchedulers; i++ {
		wg.Add(1)
		go gpuScheduler(i, wg, ctx, jobs)
	}
	timeStart := time.Now()
	// hand out the jobs
	fmt.Printf("\nRunning the parallel search on the %d-core GPU...\n", GPUCores)
	fmt.Printf("Parallel threads per batch: %d (%.2fM)\n", GPUBatchSize, float64(GPUBatchSize)/(1024*1024))
	fmt.Printf("Max threadgroup size: %d\n\n", MaxThreadsPerThreadgroup)
	go func() {
		for j := uint64(0); j <= 0xff; j++ {
			select {
			case jobs <- j:
			case <-ctx.Done():
				return
			}
		}
		close(jobs)
	}()
	wg.Wait()
	cancel()
	close(progressDone)
	// release Metal resources
	C.cleanupMetal()
	timeEnd := time.Now()
	bar.Finish()
	duration := timeEnd.Sub(timeStart)
	totalHashes := globalProgress.Load()
	hashesPerSecond := float64(totalHashes) / duration.Seconds()
	fmt.Printf("\n=== GPU 性能统计 ===\n")
	fmt.Printf("GPU: %d核\n", GPUCores)
	fmt.Printf("总耗时: %v\n", duration)
	fmt.Printf("总哈希数: %d\n", totalHashes)
	fmt.Printf("哈希速率: %.2f MH/s\n", hashesPerSecond/1000000)
	fmt.Printf("每核心速率: %.2f MH/s\n", hashesPerSecond/1000000/float64(GPUCores))
	fmt.Printf("GPU吞吐量: %.2f GB/s\n", (hashesPerSecond*64)/(1024*1024*1024))
	if foundFlag.Load() != 0 {
		fmt.Printf("\n找到的结果: %s\n", string(foundResult[:]))
	}
}
func progressUpdater(bar *progressbar.ProgressBar, done <-chan struct{}) {
	ticker := time.NewTicker(50 * time.Millisecond)
	defer ticker.Stop()
	var lastProgress int64
	var lastTime time.Time = time.Now()
	var lastHashes int64
	for {
		select {
		case <-ticker.C:
			current := globalProgress.Load()
			if current > lastProgress {
				bar.Add64(current - lastProgress)
				// compute the instantaneous rate
				now := time.Now()
				elapsed := now.Sub(lastTime).Seconds()
				if elapsed > 1.0 {
					rate := float64(current-lastHashes) / elapsed / 1000000
					bar.Describe(fmt.Sprintf("GPU running (%.2f MH/s)...", rate))
					lastTime = now
					lastHashes = current
				}
				lastProgress = current
			}
		case <-done:
			current := globalProgress.Load()
			if current > lastProgress {
				bar.Add64(current - lastProgress)
			}
			return
		}
	}
}
func gpuScheduler(id int, wg *sync.WaitGroup, ctx context.Context, jobs <-chan uint64) {
	defer wg.Done()
	result := make([]byte, 32)
	for j := range jobs {
		if foundFlag.Load() != 0 {
			break
		}
		// process one large job block (2^24 candidates)
		remaining := uint64(0xffffff + 1)
		offset := uint64(0)
		for remaining > 0 && foundFlag.Load() == 0 {
			// size of this batch
			batchSize := uint64(GPUBatchSize)
			if batchSize > remaining {
				batchSize = remaining
			}
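			// job j owns the 2^24 candidates starting at j<<24; offset walks
			// through that block in GPU-sized batches (256 jobs x 2^24 = 2^32 total)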
			startIndex := (j << 24) + offset
			// search this range on the GPU; gpuMutex serializes access to the
			// shared Metal buffers set up inside searchOnGPU
			gpuMutex.Lock()
			ret := C.searchOnGPU(
				C.uint64_t(startIndex),
				C.uint64_t(batchSize),
				(*C.uint8_t)(unsafe.Pointer(&tarBytes[0])),
				(*C.uint8_t)(unsafe.Pointer(&result[0])),
				C.int(MaxThreadsPerThreadgroup),
			)
			gpuMutex.Unlock()
			if ret == 1 {
				// found it!
				foundFlag.Store(1)
				resultMutex.Lock()
				copy(foundResult[:], result)
				resultMutex.Unlock()
				fmt.Printf("\n[GPU Scheduler %d] 找到结果: %s\n", id, string(result))
				break
			}
			// update the progress counter
			globalProgress.Add(int64(batchSize))
			offset += batchSize
			remaining -= batchSize
			// check for cancellation
			select {
			case <-ctx.Done():
				return
			default:
			}
		}
	}
}
Go's native concurrency really is a good fit for brute forcing (and it is very concise, like the one-liner below):
go func(){}()
To fully understand it, we can break it down into three parts (a small runnable sketch follows this breakdown):
The go keyword: this is what starts a goroutine. When you put go in front of a function call, the Go runtime creates a new goroutine and runs that function inside it without blocking the current flow of execution, so the main program can keep doing other work instead of waiting for the newly started function to finish.
func(){...}: this is an anonymous function definition. Unlike a regular function, an anonymous function has no name; it is defined inline, right where it is needed.
(): this immediately invokes the anonymous function defined just before it. The () right after the closing } of the function body means the function is executed right away; if the function declares parameters, the actual argument values are passed in here.
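As a small, self-contained illustration of those three parts (the names here are made up for the example), this is the same pattern with an argument passed in and a WaitGroup so main does not exit before the goroutine runs:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	wg.Add(1)

	// "go" starts the goroutine, func(msg string){...} is the anonymous
	// function, and ("hello from a goroutine") is the immediate call that
	// supplies the argument.
	go func(msg string) {
		defer wg.Done()
		fmt.Println(msg)
	}("hello from a goroutine")

	wg.Wait() // without this, main could return before the goroutine prints
}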