iOS에서 Swift와 Metal을 이용한 배열 합계 병렬 계산

@Kametrixom의 답변을 기반으로, 배열의 합계를 병렬로 계산하는 테스트 애플리케이션을 만들었습니다.

제 테스트 애플리케이션은 다음과 같습니다:

import UIKit 
import Metal 

/// Benchmark view controller that sums a large array four ways: a plain CPU loop,
/// four concurrent CPU operations, a single Metal pass, and repeated Metal passes.
/// The result and elapsed time of each variant are printed from `viewDidLoad`.
class ViewController: UIViewController {

    // Element type of the summed array; has to be the same as `DataType` in the shader.
    typealias DataType = CInt

    // Keeps the most recent partial-sum buffer alive. `sumParralel1` hands out a raw
    // pointer into this buffer, so without a strong reference the pointer would dangle
    // as soon as that method returned.
    private var lastResultsBuffer: MTLBuffer?

    /// Runs every summation variant once over the same input and prints result and time.
    override func viewDidLoad() {
        super.viewDidLoad()

        // 10,000,000 copies of the constant 200 (not random). The expected total,
        // 2,000,000,000, still fits in a 32-bit CInt.
        let data = (0..<10000000).map{ _ in DataType(200) }

        var start, end : UInt64

        // Baseline: single-threaded CPU loop.
        var result:DataType = 0
        start = mach_absolute_time()
        data.withUnsafeBufferPointer { buffer in
            for elem in buffer {
                result += elem
            }
        }
        end = mach_absolute_time()

        print("CPU result: \(result), time: \(elapsedSeconds(start, end))")

        // CPU again, split over four concurrent operations (no GPU involved).
        start = mach_absolute_time()
        result = sumParallel4(data)
        end = mach_absolute_time()

        print("CPU (4 operations) result: \(result), time: \(elapsedSeconds(start, end))")

        // One GPU pass, partial sums reduced on the CPU.
        start = mach_absolute_time()
        result = sumParralel(data)
        end = mach_absolute_time()

        print("Metal result: \(result), time: \(elapsedSeconds(start, end))")

        // Repeated GPU passes until fewer than 100 partial sums remain.
        start = mach_absolute_time()
        result = sumParallel3(data)
        end = mach_absolute_time()

        print("Metal result: \(result), time: \(elapsedSeconds(start, end))")
    }

    /// Converts a pair of `mach_absolute_time()` readings into elapsed seconds.
    ///
    /// `mach_absolute_time()` counts in device-specific ticks, not nanoseconds, so the
    /// difference has to be scaled by the timebase before dividing by `NSEC_PER_SEC`.
    private func elapsedSeconds(start: UInt64, _ end: UInt64) -> Double {
        var timebase = mach_timebase_info_data_t()
        mach_timebase_info(&timebase)
        let nanoseconds = Double(end - start) * Double(timebase.numer) / Double(timebase.denom)
        return nanoseconds / Double(NSEC_PER_SEC)
    }

    /// Runs the `parsum` kernel once over `data`.
    ///
    /// The input is cut into slices of `floor(sqrt(count))` elements and each GPU thread
    /// sums one slice. Device, library, pipeline and buffers are created on every call,
    /// so their cost is part of whatever the caller measures.
    ///
    /// - Returns: The buffer holding the partial sums and how many of them there are,
    ///   or `nil` when `data` is empty (a slice size of zero would otherwise cause a
    ///   division by zero below).
    private func computePartialSums(data: [DataType]) -> (buffer: MTLBuffer, count: Int)? {
        let count = data.count
        guard count > 0 else { return nil }

        // count >= 1 here, so the slice size is at least 1.
        let elementsPerSum = Int(sqrt(Double(count)))

        guard let device = MTLCreateSystemDefaultDevice() else {
            fatalError("Metal is not available on this device")
        }
        guard let parsum = device.newDefaultLibrary()?.newFunctionWithName("parsum") else {
            fatalError("Kernel function 'parsum' was not found in the default Metal library")
        }
        let pipeline: MTLComputePipelineState
        do {
            pipeline = try device.newComputePipelineStateWithFunction(parsum)
        } catch {
            fatalError("Could not create the compute pipeline state: \(error)")
        }

        var dataCount = CUnsignedInt(count)
        var elementsPerSumC = CUnsignedInt(elementsPerSum)
        // Number of individual results = count/elementsPerSum (rounded up).
        let resultsCount = (count + elementsPerSum - 1)/elementsPerSum

        // Input data, copied into a Metal buffer.
        let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: [])
        // One slot per partial sum; the kernel accumulates with `+=`, so it relies on
        // this buffer starting out zero-filled.
        let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: [])

        let queue = device.newCommandQueue()
        let cmds = queue.commandBuffer()
        let encoder = cmds.computeCommandEncoder()

        encoder.setComputePipelineState(pipeline)

        // Argument indices match the [[ buffer(n) ]] attributes of the kernel.
        encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
        encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
        encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
        encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

        // One thread per partial sum, grouped into threadgroups of `threadExecutionWidth`
        // threads; the group count is rounded up so every partial sum gets a thread.
        let width = pipeline.threadExecutionWidth
        let threadgroupsPerGrid = MTLSize(width: (resultsCount + width - 1)/width, height: 1, depth: 1)
        let threadsPerThreadgroup = MTLSize(width: width, height: 1, depth: 1)

        encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
        encoder.endEncoding()

        // Block until the GPU has finished so the results can be read right away.
        cmds.commit()
        cmds.waitUntilCompleted()

        return (buffer: resultsBuffer, count: resultsCount)
    }

    /// Sums `data` with one GPU pass followed by a CPU reduction of the partial sums.
    ///
    /// - Returns: The total of all elements; `0` for an empty array.
    func sumParralel(data : Array<DataType>) -> DataType {
        guard let partial = computePartialSums(data) else { return 0 }

        var result : DataType = 0
        // The raw pointer is only valid while the buffer is alive, so pin the buffer
        // for the duration of the read.
        withExtendedLifetime(partial.buffer) {
            let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(partial.buffer.contents()), count: partial.count)
            for elem in results {
                result += elem
            }
        }
        return result
    }

    /// Runs one GPU pass over `data` and exposes the partial sums without copying them.
    ///
    /// - Returns: A view onto the partial sums (empty for an empty input). The memory
    ///   belongs to a buffer retained by this view controller and stays valid only
    ///   until the next call to `sumParralel1`.
    func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {
        guard let partial = computePartialSums(data) else {
            lastResultsBuffer = nil
            return UnsafeBufferPointer<DataType>(start: nil, count: 0)
        }

        // Retain the buffer so the pointer returned below outlives this call.
        lastResultsBuffer = partial.buffer
        return UnsafeBufferPointer<DataType>(start: UnsafePointer(partial.buffer.contents()), count: partial.count)
    }

    /// Sums `data` by feeding the partial sums back through the GPU until fewer than
    /// 100 of them remain, then adds up the remainder on the CPU.
    ///
    /// - Returns: The total of all elements; `0` for an empty array.
    func sumParallel3(data : Array<DataType>) -> DataType {

        var results = sumParralel1(data)

        repeat {
            // `Array(results)` copies the previous partial sums before the next call
            // replaces the buffer they live in.
            results = sumParralel1(Array(results))
        } while results.count >= 100

        var result : DataType = 0

        for elem in results {
            result += elem
        }

        return result
    }

    /// Sums `data` on the CPU using four concurrent operations, one per quarter of
    /// the array.
    ///
    /// - Returns: The total of all elements.
    func sumParallel4(data : Array<DataType>) -> DataType {

        let queue = NSOperationQueue()
        queue.maxConcurrentOperationCount = 4

        // Quarter boundaries; the last quarter absorbs any remainder.
        let firstCut = data.count/4
        let secondCut = data.count/2
        let thirdCut = 3 * data.count/4

        // Each operation writes only its own accumulator, so they never share state.
        var a0 : DataType = 0
        var a1 : DataType = 0
        var a2 : DataType = 0
        var a3 : DataType = 0

        let op0 = NSBlockOperation(block : {
            for i in 0..<firstCut {
                a0 = a0 + data[i]
            }
        })

        let op1 = NSBlockOperation(block : {
            for i in firstCut..<secondCut {
                a1 = a1 + data[i]
            }
        })

        let op2 = NSBlockOperation(block : {
            for i in secondCut..<thirdCut {
                a2 = a2 + data[i]
            }
        })

        let op3 = NSBlockOperation(block : {
            for i in thirdCut..<(data.count) {
                a3 = a3 + data[i]
            }
        })

        // Enqueue all four and block until every one has finished.
        queue.addOperations([op0, op1, op2, op3], waitUntilFinished: true)

        return a0 + a1 + a2 + a3
    }
}

그리고 제 셰이더는 다음과 같습니다:

// Partial-sum kernel: each thread adds up one contiguous slice of `data` and
// accumulates the slice total into its own slot of `sums`.
//
//   data           input elements
//   dataLength     number of valid elements in `data`
//   sums           one slot per slice; totals are added with `+=`, so the host is
//                  expected to pass the slots in zeroed
//   elementsPerSum number of elements in each slice (the last slice may be shorter)
//
// NOTE(review): `DataType` is not declared in this snippet; it has to be defined
// elsewhere in the shader file and match the host-side element type.
kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],

                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {

    // Index of the individual result; unique to this thread.
    const uint resultIndex = tgPos * tPerTg + tPos;

    // Slice [beginIndex, endIndex) handled by this thread, clamped to the data length.
    const uint beginIndex = resultIndex * elementsPerSum;
    const uint endIndex = beginIndex + elementsPerSum < dataLength ? beginIndex + elementsPerSum : dataLength;

    // Threads whose slice is empty must not touch `sums` at all: their
    // `resultIndex` may lie past the last slot.
    if (beginIndex >= endIndex)
        return;

    // Accumulate in a local and write once, instead of a read-modify-write on
    // device memory for every element.
    DataType partial = 0;
    for (uint i = beginIndex; i < endIndex; i++)
        partial += data[i];

    sums[resultIndex] += partial;
}

놀랍게도 sumParallel4 함수가 가장 빠른데, 저는 그렇지 않아야 한다고 생각했습니다. 또한 sumParralel과 sumParallel3을 호출할 때, 호출 순서를 바꾸더라도 먼저 호출한 함수가 항상 더 느리다는 것을 알았습니다. (sumParralel을 먼저 호출하면 sumParralel이 더 느리고, sumParallel3을 먼저 호출하면 sumParallel3이 더 느립니다.)

왜 그런가요? sumParallel3이 sumParralel보다 훨씬 빠르지 않은 이유는 무엇입니까? sumParallel4는 CPU에서 계산되는데도 왜 가장 빠릅니까?

GPU 함수들을 posix_memalign을 사용하도록 어떻게 수정할 수 있습니까? GPU와 CPU가 메모리를 공유하게 되므로 더 빨리 동작할 것이라고 알고 있지만, 어느 배열(data 또는 results)을 이 방식으로 할당해야 하는지 모르겠습니다. 또한 data가 함수에 전달되는 매개변수인 경우, posix_memalign으로 어떻게 할당할 수 있습니까?

출처

2016-07-06 Marko Zadravec

첫 번째 실행이 가장 느린 이유는 아마 그 호출에서 전역 객체들을 생성하기 때문일 것입니다. 두 번째 실행에서는 그 전역 객체들을 다시 만들 필요가 없습니다. – Putz1103

그럴 수도 있겠네요! 그러면 posix_memalign은 어떤가요? 어떻게 사용하면 되는지 아이디어가 있으신가요? –

나는 이것에 대해 전혀 경험이 없지만이 사이트는 CPU/GPU 버퍼 공유 및 메모리 정렬과 관련하여 시작하기에 좋은 장소처럼 보였다. http://memkite.com/blog/2014/12/30/example-of-sharing-memory-between-gpu-and-cpu-with-swift-and-metal-for-ios8/ 행운을 비네. – Putz1103

iPhone 6에서 이 테스트들을 실행해 보니, Metal 버전은 단순한(naive) CPU 합산보다 3배 느린 경우부터 2배 빠른 경우까지 편차가 있었습니다. 아래에서 설명하는 수정 사항을 적용하자 일관되게 더 빨라졌습니다.

Metal 버전을 실행하는 비용의 상당 부분은 버퍼 할당(이것도 무시할 수 없는 비용이었습니다)뿐만 아니라, 디바이스와 컴퓨트 파이프라인 상태를 처음 생성하는 데에서도 발생한다는 것을 알게 되었습니다. 이러한 작업은 보통 애플리케이션 초기화 시 한 번만 수행하므로, 타이밍 측정에 포함시키지 않는 것이 좋습니다.

또한 Metal 검증 레이어(validation layer)와 GPU 프레임 캡처를 활성화한 상태로 Xcode에서 이 테스트들을 실행하면, 런타임 비용이 상당히 커서 결과가 CPU에 유리하게 왜곡된다는 점에 유의해야 합니다.

MTLBuffer의 백업 저장소로 사용할 수 있는 메모리를 할당하는 방법은 다음과 같습니다. 핵심은 요청한 메모리가 실제로 페이지 정렬(즉, 주소가 getpagesize()의 배수)되도록 하는 것이며, 이를 위해 데이터를 저장하는 데 실제로 필요한 양보다 크게 메모리 크기를 올림해야 할 수도 있습니다:

// Number of elements that actually carry data.
let dataCount = 1_000_000 
let dataSize = dataCount * strideof(DataType) 

// Round the allocation up to a whole number of pages so that both the address and
// the length are page-size multiples.
let pageSize = Int(getpagesize()) 
let pageCount = (dataSize + (pageSize - 1))/pageSize 
let allocationSize = pageCount * pageSize

var dataPointer: UnsafeMutablePointer<Void> = nil 
// posix_memalign reports failure through its return value; on failure the pointer
// must not be used.
let status = posix_memalign(&dataPointer, pageSize, allocationSize)
precondition(status == 0 && dataPointer != nil, "posix_memalign failed with error \(status)")

// `data.count` is the rounded-up capacity, which can be larger than `dataCount`.
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer), 
                                      count: allocationSize/strideof(DataType)) 

for i in 0..<dataCount { 
    data[i] = 200 
}

// posix_memalign does not clear the memory; zero the padding past `dataCount` so
// the whole buffer has defined contents.
for i in dataCount..<data.count {
    data[i] = 0
}

이 방법을 쓰려면 data를 [DataType]이 아니라 UnsafeMutableBufferPointer<DataType>로 만들어야 합니다. Swift의 Array는 자체 백업 저장소를 할당하기 때문입니다. 또한 버퍼를 페이지 정렬하기 위해 가변 버퍼 포인터의 count가 올림되었으므로, 실제로 처리할 데이터 항목 수를 별도로 전달해야 합니다.

실제로 이 데이터를 백업 저장소로 하는 MTLBuffer를 생성하려면 newBufferWithBytesNoCopy(_:length:options:deallocator:) API를 사용하십시오. 다시 한 번 강조하지만, 전달하는 길이가 페이지 크기의 배수여야 한다는 점이 중요합니다. 그렇지 않으면 이 메서드는 nil을 반환합니다:

// Byte length of the whole buffer pointer, i.e. its full element count times the
// element stride.
// NOTE(review): assumes `data` is the page-aligned buffer pointer whose count was
// rounded up to a page multiple — confirm against where `data` is allocated.
let roundedUpDataSize = data.count * strideof(DataType)

// Wrap the existing memory in a Metal buffer without copying it. No deallocator is
// supplied, so releasing the buffer does not free the underlying memory.
let dataBuffer = device.newBufferWithBytesNoCopy(
    data.baseAddress,
    length: roundedUpDataSize,
    options: [],
    deallocator: nil)

여기서는 deallocator를 제공하지 않았지만, 사용이 끝나면 버퍼 포인터의 baseAddress를 free()에 전달하여 메모리를 해제해야 합니다.

출처

2016-07-06 21:36:50 warrenm

iOS에서 Swift와 Metal을 이용한 배열 합계 병렬 계산

답변

관련 문제