@Kametrixom answer을 기반으로 배열의 합계를 병렬 계산하는 테스트 애플리케이션을 만들었습니다.iOS에서 배열의 신속한 금속 병렬 합계 계산
내 테스트 응용 프로그램은 다음과 같습니다
import UIKit
import Metal
class ViewController: UIViewController {
// Data type, has to be the same as in the shader
typealias DataType = CInt
override func viewDidLoad() {
super.viewDidLoad()
let data = (0..<10000000).map{ _ in DataType(200) } // Our data, randomly generated
var start, end : UInt64
var result:DataType = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
for elem in buffer {
result += elem
}
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start)/Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel4(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start)/Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParralel(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start)/Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
result = sumParallel3(data)
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start)/Double(NSEC_PER_SEC))")
}
func sumParralel(data : Array<DataType>) -> DataType {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1)/elementsPerSum // Number of individual results = count/elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount`/`threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1)/pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
var result : DataType = 0
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
result += elem
}
return result
}
func sumParralel1(data : Array<DataType>) -> UnsafeBufferPointer<DataType> {
let count = data.count
let elementsPerSum: Int = Int(sqrt(Double(count)))
let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1)/elementsPerSum // Number of individual results = count/elementsPerSum (rounded up)
let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later
let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount`/`threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1)/pipeline.threadExecutionWidth, height: 1, depth: 1)
// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
cmds.commit()
cmds.waitUntilCompleted()
return results
}
func sumParallel3(data : Array<DataType>) -> DataType {
var results = sumParralel1(data)
repeat {
results = sumParralel1(Array(results))
} while results.count >= 100
var result : DataType = 0
for elem in results {
result += elem
}
return result
}
func sumParallel4(data : Array<DataType>) -> DataType {
let queue = NSOperationQueue()
queue.maxConcurrentOperationCount = 4
var a0 : DataType = 0
var a1 : DataType = 0
var a2 : DataType = 0
var a3 : DataType = 0
let op0 = NSBlockOperation(block : {
for i in 0..<(data.count/4) {
a0 = a0 + data[i]
}
})
let op1 = NSBlockOperation(block : {
for i in (data.count/4)..<(data.count/2) {
a1 = a1 + data[i]
}
})
let op2 = NSBlockOperation(block : {
for i in (data.count/2)..<(3 * data.count/4) {
a2 = a2 + data[i]
}
})
let op3 = NSBlockOperation(block : {
for i in (3 * data.count/4)..<(data.count) {
a3 = a3 + data[i]
}
})
queue.addOperation(op0)
queue.addOperation(op1)
queue.addOperation(op2)
queue.addOperation(op3)
queue.suspended = false
queue.waitUntilAllOperationsAreFinished()
let aaa: DataType = a0 + a1 + a2 + a3
return aaa
}
}
그리고는 다음과 같습니다 쉐이더 있습니다 놀랍게도 기능에
kernel void parsum(const device DataType* data [[ buffer(0) ]],
const device uint& dataLength [[ buffer(1) ]],
device DataType* sums [[ buffer(2) ]],
const device uint& elementsPerSum [[ buffer(3) ]],
const uint tgPos [[ threadgroup_position_in_grid ]],
const uint tPerTg [[ threads_per_threadgroup ]],
const uint tPos [[ thread_position_in_threadgroup ]]) {
uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end
for (; dataIndex < endIndex; dataIndex++)
sums[resultIndex] += data[dataIndex];
}
을 sumParallel4
나는 '이 shouldn 생각하는, 가장 빠른 그러지 마라. 내가 함수 sumParralel
과 sumParallel3
을 호출 할 때, 함수의 순서를 바꾸더라도 첫 번째 함수는 항상 더 느리다는 것을 알았다. (sumParral3을 먼저 호출하면 속도가 느려집니다. sumParallel3을 호출하면 속도가 느려집니다.).
왜 이런가요? sumParallel3이 sumParallel보다 훨씬 빠르지 않은 이유는 무엇입니까? sumParallel4가 CPU에서 계산 되더라도 왜 가장 빠릅니까?
내 GPU 기능을 posix_memalign
으로 어떻게 업데이트 할 수 있습니까? GPU와 CPU간에 메모리를 공유하기 때문에 더 빨리 작동해야한다는 것을 알았지 만, 마녀 배열을이 방법 (데이터 또는 결과)으로 할당해야할지 모르며 posix_memalign을 사용하여 데이터가 함수에 전달 된 매개 변수 인 경우 어떻게 데이터를 할당 할 수 있습니까? ?
첫 번째 실행이 가장 빠른 이유는 호출에서 전역 개체를 만들기 때문에 두 번째 실행에서 해당 전역 개체를 만들 필요가 없기 때문입니다. – Putz1103
아마도 그럴 수도 있습니다! posix_memalign은 어떨까요? 어떤 생각으로 그것을 사용하는 방법? –
나는 이것에 대해 전혀 경험이 없지만이 사이트는 CPU/GPU 버퍼 공유 및 메모리 정렬과 관련하여 시작하기에 좋은 장소처럼 보였다. http://memkite.com/blog/2014/12/30/example-of-sharing-memory-between-gpu-and-cpu-with-swift-and-metal-for-ios8/ 행운을 비네. – Putz1103