昨晚看SpinalHDL的Issues,其中有一個關于性能提升的case吸引到了我,嘗試實驗到深夜,測試了一下在SpinalHDL以及cocotb下的性能優化手段。
SpinalHDL Simulation性能提升測試
無論是SpinalHDL還是cocotb,其在仿真方面所采用的思路是一樣的。在SpinalHDL的那個Issue里,Dolu主要想做的是盡可能避免在信號的賦值和讀取上的冗余代碼。以toInt為例,其會調用getInt函數:
/** Reads the current simulation value of `bt` as an Int.
  *
  * Every call looks up the current simulation manager and resolves the
  * signal that backs `bt` again before reading it.
  *
  * @param bt the hardware signal to read
  * @return 0 when `bt` has a width of zero bits, otherwise the value returned
  *         by the manager for the resolved signal
  */
private def getInt(bt: BaseType): Int = {
  // Zero-width signals are answered directly, without touching the manager.
  if (bt.getBitsWidth == 0) return 0
  val manager = SimManagerContext.current.manager
  val signal = btToSignal(manager, bt)
  manager.getInt(signal)
}
而Dolu的思路則是沒有必要每次都重新尋找manager、signal這些信息,畢竟對于一個信號而言這兩個值是不變的。對于這種頻繁使用的接口,提前把這些信息準備好就能夠盡可能降低不必要的開銷。
這里做了個測試,testSimNormal采用普通的API調用形式訪問信號,
testSimSpeed則采用加速后的方式訪問同一個512bit位寬的信號。兩者分別測試訪問1億次信號值所消耗的時間,測試結果如下:
testSimNormal | 26998ms |
testSimSpeed | 24462ms |
可以看出,還是能夠加速仿真速度的。
考慮到在仿真過程中無非是信號的驅動和讀取,那么這里應該是都適用的,遂以相同的DUT相同的Case嘗試做了如下測試:
testSim1:采用SpinalHDL原生API進行仿真測試
testSim2:將信號的讀取和賦值均改為優化后的方式
testSim3:在testSim2的基礎上將時鐘、復位驅動也改為優化后的方式
testSim4:在testSim2的基礎上將waitSampling修改為優化后的方式
testSim5:在testSim3的基礎上將waitSampling也修改為優化后的方式
測試結果如下:
testSim1 | 6469.675 ms |
testSim2 | 6196.007 ms |
testSim3 | 6196.007 ms |
testSim4 | 6066.035 ms |
testSim5 | 6076.121 ms |
每個測試里面都是跑了500000周期,可以看到,對于縮短仿真耗時還是有效果的。對于更大的case,也許會有更明顯的效果。
附上完整的測試代碼(由于電腦較差,諸君可自行測試):
import spinal.core._
import spinal.lib._
import spinal.sim.{Signal, SimManager, SimManagerContext}

import scala.collection.mutable.ArrayBuffer

/** Device under test: a 512-bit Flow whose payload is incremented by one and
  * passed through `stage()` before being driven on `data_out`.
  */
case class dut() extends Component {
  val io = new Bundle {
    val data_in = slave Flow (UInt(512 bits))
    val data_out = master Flow (UInt(512 bits))
  }
  noIoPrefix()
  io.data_out << io.data_in.translateWith(io.data_in.payload + 1).stage()
}

import spinal.core.sim._

/** Baseline micro-benchmark: reads `data_out.payload` 100,000,000 times through
  * the regular `toBigInt` simulation API and prints the elapsed wall-clock time.
  */
object testSimNormal extends App {
  SimConfig.withFstWave.compile(dut()).doSim { dut =>
    dut.io.data_in.valid #= false
    dut.clockDomain.forkStimulus(10)
    dut.clockDomain.waitSampling(10)

    val startTime = System.currentTimeMillis()
    for (index <- 0 until 100000000) {
      dut.io.data_out.payload.toBigInt
    }
    val endTime = System.currentTimeMillis()
    val totalTime = endTime - startTime
    println("代碼運行時間:" + totalTime + "毫秒")
  }
}

/** Same micro-benchmark as [[testSimNormal]], but the signal is read through a
  * `SimProxy` created once before the loop (see [[SimExtend]]).
  */
object testSimSpeed extends App {
  // The proxy used here is the one defined in SimExtend; it is imported rather
  // than declared a second time inside this object.
  import SimExtend._

  SimConfig.withFstWave.compile(dut()).doSim { dut =>
    dut.io.data_in.valid #= false
    dut.clockDomain.forkStimulus(10)
    dut.clockDomain.waitSampling(10)

    val dataOutHdl = dut.io.data_out.payload.simProxy()

    val startTime = System.currentTimeMillis()
    for (index <- 0 until 100000000) {
      dataOutHdl.getBigInt
    }
    val endTime = System.currentTimeMillis()
    val totalTime = endTime - startTime
    println("代碼運行時間:" + totalTime + "毫秒")
  }
}

/** Helpers that give direct access to simulation signals. */
object SimExtend {

  /** Adds `simProxy()` to any `BaseType`. */
  implicit class SimBitVectorPimper(bt: BaseType) {

    /** Captures the current simulation manager and the `Signal` of `bt` once,
      * at construction time; every accessor below then calls the manager
      * directly with that cached signal.
      */
    class SimProxy(bt: BaseType) {
      val manager = SimManagerContext.current.manager
      // The signal is taken from the manager's raw user data, indexed by bt.algoInt.
      val signal = manager.raw.userData.asInstanceOf[ArrayBuffer[Signal]](bt.algoInt)
      val alwaysZero = bt.getBitsWidth == 0

      def getLong = manager.getLong(signal)
      def getBoolean = manager.getLong(signal) != 0
      def getBigInt = manager.getBigInt(signal)

      def assignBoolean(value: Boolean) = manager.setLong(signal, value.toInt)
      def setLong(value: Long) = manager.setLong(signal, value)
      def assignBigInt(value: BigInt) = manager.setBigInt(signal, value)
    }

    /** Creates a new proxy for the wrapped signal. Must be called while a
      * simulation manager is current, because the proxy reads
      * `SimManagerContext.current` in its constructor.
      */
    def simProxy() = new SimProxy(bt)
  }

  /** Resolves the `Bool` to use for `who` in simulation.
    *
    * @param manager the simulation manager; its `userData` is cast to `Component`
    * @param who     the signal to resolve (used below for clock and reset)
    * @return `who` itself when it is an input or output of a component that has
    *         no parent; otherwise the entry stored for `who` in that component's
    *         `pulledDataCache`, or `null` when there is no such entry
    */
  def getBool(manager: SimManager, who: Bool): Bool = {
    val component = who.component
    if ((who.isInput || who.isOutput) && component != null && component.parent == null) {
      who
    } else {
      manager.userData.asInstanceOf[Component].pulledDataCache.getOrElse(who, null).asInstanceOf[Bool]
    }
  }
}

/** Runs the same 500,000-cycle stimulus five times against one compiled DUT,
  * each time replacing more of the regular simulation API with `SimProxy` calls.
  */
object testSim extends App {
  val dutCompiled = SimConfig.withFstWave.compile(dut())

  // ---------------------------------------------------------------------------
  // testSim1: regular SpinalHDL simulation API everywhere
  // ---------------------------------------------------------------------------
  dutCompiled.doSim { dut =>
    dut.io.data_in.valid #= false
    dut.clockDomain.forkStimulus(10)
    dut.clockDomain.waitSampling(10)

    var sum = BigInt(0)
    for (index <- 0 until 500000) {
      dut.clockDomain.waitSampling()
      if (dut.io.data_out.valid.toBoolean) {
        sum = sum + dut.io.data_out.payload.toBigInt
      }
      dut.io.data_in.valid #= true
      dut.io.data_in.payload #= BigInt(index)
    }
  }

  // ---------------------------------------------------------------------------
  // testSim2: signal reads and writes go through SimProxy
  // ---------------------------------------------------------------------------
  dutCompiled.doSim { dut =>
    import SimExtend._
    val dataInValidHdl = dut.io.data_in.valid.simProxy()
    val dataInDataHdl = dut.io.data_in.payload.simProxy()
    val dataOutValidHdl = dut.io.data_out.valid.simProxy()
    val dataOutDataHdl = dut.io.data_out.payload.simProxy()

    dataInValidHdl.assignBoolean(false)
    dut.clockDomain.forkStimulus(10)
    dut.clockDomain.waitSampling(10)

    var sum = BigInt(0)
    for (index <- 0 until 500000) {
      dut.clockDomain.waitSampling()
      if (dataOutValidHdl.getBoolean) {
        sum = sum + dataOutDataHdl.getBigInt
      }
      dataInValidHdl.assignBoolean(true)
      dataInDataHdl.assignBigInt(index)
    }
  }

  // ---------------------------------------------------------------------------
  // testSim3: testSim2 + clock and reset driven through SimProxy
  // ---------------------------------------------------------------------------
  dutCompiled.doSim { dut =>
    import SimExtend._
    val dataInValidHdl = dut.io.data_in.valid.simProxy()
    val dataInDataHdl = dut.io.data_in.payload.simProxy()
    val dataOutValidHdl = dut.io.data_out.valid.simProxy()
    val dataOutDataHdl = dut.io.data_out.payload.simProxy()
    val clock = getBool(SimManagerContext.current.manager, dut.clockDomain.clock).simProxy()
    val reset = getBool(SimManagerContext.current.manager, dut.clockDomain.reset).simProxy()

    dataInValidHdl.assignBoolean(false)

    // Clock generation: hold reset for 160 time units, then toggle the clock
    // every 5 time units from a forked thread that re-schedules itself.
    clock.assignBoolean(false)
    reset.assignBoolean(true)
    sleep(10 * 16)
    reset.assignBoolean(false)
    fork {
      var value = false
      def t: Unit = {
        value = !value
        clock.assignBoolean(value)
        delayed(5)(t)
      }
      t
    }

    dut.clockDomain.waitSampling(10)

    var sum = BigInt(0)
    for (index <- 0 until 500000) {
      dut.clockDomain.waitSampling()
      if (dataOutValidHdl.getBoolean) {
        sum = sum + dataOutDataHdl.getBigInt
      }
      dataInValidHdl.assignBoolean(true)
      dataInDataHdl.assignBigInt(index)
    }
  }

  // ---------------------------------------------------------------------------
  // testSim4: testSim2 + waitSampling replaced by a waitUntil on the clock proxy
  // ---------------------------------------------------------------------------
  dutCompiled.doSim { dut =>
    import SimExtend._
    val dataInValidHdl = dut.io.data_in.valid.simProxy()
    val dataInDataHdl = dut.io.data_in.payload.simProxy()
    val dataOutValidHdl = dut.io.data_out.valid.simProxy()
    val dataOutDataHdl = dut.io.data_out.payload.simProxy()
    val clock = getBool(SimManagerContext.current.manager, dut.clockDomain.clock).simProxy()
    var rising = false
    var last = false

    dataInValidHdl.assignBoolean(false)

    // Clock generation stays on the regular API in this variant.
    dut.clockDomain.forkStimulus(10)
    dut.clockDomain.waitSampling(10)

    var sum = BigInt(0)
    for (index <- 0 until 500000) {
      // The condition is true only when the clock reads high after having read
      // low on the previous evaluation, i.e. on a low-to-high transition.
      waitUntil {
        rising = false
        val current = clock.getBoolean
        if ((!last) && current) {
          rising = true
        }
        last = current
        rising
      }
      if (dataOutValidHdl.getBoolean) {
        sum = sum + dataOutDataHdl.getBigInt
      }
      dataInValidHdl.assignBoolean(true)
      dataInDataHdl.assignBigInt(index)
    }
  }

  // ---------------------------------------------------------------------------
  // testSim5: testSim3 + waitSampling replaced by a waitUntil on the clock proxy
  // ---------------------------------------------------------------------------
  dutCompiled.doSim { dut =>
    import SimExtend._
    val dataInValidHdl = dut.io.data_in.valid.simProxy()
    val dataInDataHdl = dut.io.data_in.payload.simProxy()
    val dataOutValidHdl = dut.io.data_out.valid.simProxy()
    val dataOutDataHdl = dut.io.data_out.payload.simProxy()
    val clock = getBool(SimManagerContext.current.manager, dut.clockDomain.clock).simProxy()
    val reset = getBool(SimManagerContext.current.manager, dut.clockDomain.reset).simProxy()
    var rising = false
    var last = false

    dataInValidHdl.assignBoolean(false)

    // Clock generation: same proxy-driven reset and clock as in testSim3.
    clock.assignBoolean(false)
    reset.assignBoolean(true)
    sleep(10 * 16)
    reset.assignBoolean(false)
    fork {
      var value = false
      def t: Unit = {
        value = !value
        clock.assignBoolean(value)
        delayed(5)(t)
      }
      t
    }

    // The warm-up still uses the regular waitSampling, as in the other variants.
    dut.clockDomain.waitSampling(10)

    var sum = BigInt(0)
    for (index <- 0 until 500000) {
      // Low-to-high transition detection, identical to testSim4.
      waitUntil {
        rising = false
        val current = clock.getBoolean
        if ((!last) && current) {
          rising = true
        }
        last = current
        rising
      }
      if (dataOutValidHdl.getBoolean) {
        sum = sum + dataOutDataHdl.getBigInt
      }
      dataInValidHdl.assignBoolean(true)
      dataInDataHdl.assignBigInt(index)
    }
  }
}

// Next article section heading (was fused onto the end of this listing):
// "cocotb性能優化" (cocotb performance optimization)
對于cocotb的仿真速度,我一直是持保留意見的。在SpinalHDL里面做完嘗試后,由于最近工作里用到cocotb較多,就嘗試看下能否應用到cocotb中。看了下cocotb中信號讀寫封裝背后的調用,其做了太多的封裝和調用。遂采用了相同的DUT做了同樣的測試。首先是優化前后各訪問一百萬次信號的對比測試(跑一億次真的太久了):
testSimNormal | 3.58s |
testSimSpeed | 1.09s |
可以看到,這里有明顯的性能提升。
再來構(gòu)建下面的六個case:
testCase0:采用cocotb提供的API接口進行數據讀寫訪問
testCase1:僅將信號讀更改為底層接口直接調用形式進行訪問
testCase2:將信號讀、信號寫均改為底層接口直接調用形式進行訪問
testCase3:在testCase2的基礎上將信號接口提前生成好而不是使用時例化
testCase4:在testCase3的基礎上將時鐘生成修改為底層接口直接調用形式
testCase5:在testCase0基礎上,僅將時鐘生成修改為底層接口直接調用的形式
測試結果如下:
每個Case中均做100000次周期測試。可以看到,與原生Case仿真相比,testCase5能提升1.7倍多,而testCase4則有4.8倍的性能提升。由此可見,cocotb中對于信號讀寫的封裝由于做了太多安全和邊界的處理,導致這種在仿真中經常使用的函數帶來挺大的開銷。
由于Verilator好像不支持時鐘下沉,如果能將時鐘的驅動放到Verilog里面,也許還會有進一步的性能提升。
本人對于底層的東西不甚了解,單純從仿真速度上看,cocotb相較于SpinalHDL還是有較大的差距(《既生瑜何生亮——SpinalHDL VS Cocotb》)。有一點有意思的是,在SpinalHDL里面修改時鐘生成的方式并未有太大的性能提升,而在cocotb里卻有明顯改善,諸君有興趣可以自行研究。
附上源碼,感興趣的小伙伴可以自行測試:
DUT:
// Generator : SpinalHDL v1.8.0b    git head : 761a30e521263983ddf14de3592f7a9f38bf0589
// Component : simSpeedUpTest

`timescale 1ns/1ps

// Registers data_in + 1 and data_in_valid on every rising clock edge.
// reset is sampled inside the clocked block and clears both outputs to zero.
module dut (
  input               data_in_valid,
  output reg          data_out_valid,
  input      [511:0]  data_in,
  output reg [511:0]  data_out,
  input               clk,
  input               reset
);

  always @(posedge clk) begin
    if (reset) begin
      data_out       <= 'd0;
      data_out_valid <= 'd0;
    end else begin
      data_out       <= data_in + 1;
      data_out_valid <= data_in_valid;
    end
  end

endmodule
TestBench:
import cocotb
from cocotb_bus.drivers import BusDriver
from cocotb.clock import Clock
from cocotb.triggers import ClockCycles, RisingEdge, Timer, ReadOnly
from cocotb.handle import *

# NOTE(review): the published listing had lost all line breaks and indentation.
# Loop bodies below were reconstructed from the statements' meaning (the
# per-cycle statements use `index`; the trailing ClockCycles wait does not).
# Verify against the author's original file if exact timing matters.
#
# The low-level calls take two arguments, e.g. set_signal_val_int(0, 1). The
# second is the value written; the first is presumably the write action
# (0 = deposit, 1 = force) -- TODO confirm for the cocotb version in use.


@cocotb.test(skip=False)
async def testCaseNormal(dut):
    """Micro-benchmark: read a signal 1,000,000 times via the `.value` API."""
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    dataInvalidSignal = dut.data_in_valid._handle
    dataInDataSignal = dut.data_in._handle
    cocotb.start_soon(generateClk(dut.clk))
    dataInDataSignal.set_signal_val_binstr(0, bin(0)[2:])
    dataInvalidSignal.set_signal_val_int(0, 0)
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    # NOTE(review): this reads data_out_valid, while testCaseSpeed reads
    # data_out; the two micro-benchmarks do not access the same signal.
    for index in range(1000000):
        dut.data_out_valid.value
    await ClockCycles(dut.clk, 10)


@cocotb.test(skip=False)
async def testCaseSpeed(dut):
    """Micro-benchmark: read a signal 1,000,000 times via a pre-fetched handle."""
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    dataInvalidSignal = dut.data_in_valid._handle
    dataInDataSignal = dut.data_in._handle
    cocotb.start_soon(generateClk(dut.clk))
    dataInDataSignal.set_signal_val_binstr(0, bin(0)[2:])
    dataInvalidSignal.set_signal_val_int(0, 0)
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    for index in range(1000000):
        targetDataSignal.get_signal_val_binstr()
    await ClockCycles(dut.clk, 10)


@cocotb.test(skip=False)
async def testCase0(dut):
    """Baseline: cocotb `Clock` plus the `.value` API for every read and write."""
    cocotb.start_soon(Clock(dut.clk, 10, 'ns').start())
    dut.reset.value = 1
    dut.data_in.value = 0
    dut.data_in_valid.value = 0
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    targetSignal = dut.data_out._handle
    for index in range(100000):
        await RisingEdge(dut.clk)
        if int(dut.data_out_valid.value) == 1:
            total += dut.data_out.value
        dut.data_in_valid.value = 1
        dut.data_in.value = index
    # NOTE(review): this waits 100000 cycles after the loop, whereas testCase1-4
    # wait only 10; the extra simulated time is included in this case's runtime.
    await ClockCycles(dut.clk, 100000)


@cocotb.test(skip=False)
async def testCase1(dut):
    """Reads go through the low-level handle; writes still use `.value`."""
    cocotb.start_soon(Clock(dut.clk, 10, 'ns').start())
    dut.data_in.value = 0
    dut.data_in_valid.value = 0
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    for index in range(100000):
        await RisingEdge(dut.clk)
        if targetValueSignal.get_signal_val_long() == 1:
            total += int(targetDataSignal.get_signal_val_binstr(), 2)
        dut.data_in_valid.value = 1
        dut.data_in.value = index
    await ClockCycles(dut.clk, 10)


@cocotb.test(skip=False)
async def testCase2(dut):
    """Reads and writes both go through low-level handles.

    The write handles are looked up via `dut.<signal>._handle` on every cycle.
    """
    cocotb.start_soon(Clock(dut.clk, 10, 'ns').start())
    dut.data_in.value = 0
    dut.data_in_valid.value = 0
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    for index in range(100000):
        await RisingEdge(dut.clk)
        if targetValueSignal.get_signal_val_long() == 1:
            total += int(targetDataSignal.get_signal_val_binstr(), 2)
        dut.data_in._handle.set_signal_val_binstr(0, bin(index)[2:])
        dut.data_in_valid._handle.set_signal_val_int(0, 1)
    await ClockCycles(dut.clk, 10)


@cocotb.test(skip=False)
async def testCase3(dut):
    """Like testCase2, but all four handles are fetched once before the loop."""
    cocotb.start_soon(Clock(dut.clk, 10, 'ns').start())
    dut.data_in.value = 0
    dut.data_in_valid.value = 0
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    dataInvalidSignal = dut.data_in_valid._handle
    dataInDataSignal = dut.data_in._handle
    for index in range(100000):
        await RisingEdge(dut.clk)
        if targetValueSignal.get_signal_val_long() == 1:
            total += int(targetDataSignal.get_signal_val_binstr(), 2)
        dataInDataSignal.set_signal_val_binstr(0, bin(index)[2:])
        dataInvalidSignal.set_signal_val_int(0, 1)
    await ClockCycles(dut.clk, 10)


async def generateClk(clk):
    """Drive `clk` forever through its low-level handle.

    The first write sets the value 0; the loop then alternates writing 0 and 1
    with a 5 ns Timer before each write.
    """
    clk._handle.set_signal_val_int(1, 0)
    while True:
        await Timer(5, units="ns")
        clk._handle.set_signal_val_int(0, 0)
        await Timer(5, units="ns")
        clk._handle.set_signal_val_int(0, 1)


@cocotb.test(skip=False)
async def testCase4(dut):
    """Like testCase3, but the clock comes from `generateClk` instead of `Clock`."""
    targetDataSignal = dut.data_out._handle
    targetValueSignal = dut.data_out_valid._handle
    dataInvalidSignal = dut.data_in_valid._handle
    dataInDataSignal = dut.data_in._handle
    cocotb.start_soon(generateClk(dut.clk))
    dataInDataSignal.set_signal_val_binstr(0, bin(0)[2:])
    dataInvalidSignal.set_signal_val_int(0, 0)
    dut.reset.value = 1
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    for index in range(100000):
        await RisingEdge(dut.clk)
        if targetValueSignal.get_signal_val_long() == 1:
            total += int(targetDataSignal.get_signal_val_binstr(), 2)
        dataInDataSignal.set_signal_val_binstr(0, bin(index)[2:])
        dataInvalidSignal.set_signal_val_int(0, 1)
    await ClockCycles(dut.clk, 10)


@cocotb.test(skip=False)
async def testCase5(dut):
    """Like testCase0, but the clock comes from `generateClk` instead of `Clock`."""
    cocotb.start_soon(generateClk(dut.clk))
    dut.reset.value = 1
    dut.data_in.value = 0
    dut.data_in_valid.value = 0
    await ClockCycles(dut.clk, 10)
    dut.reset.value = 0
    await ClockCycles(dut.clk, 10)
    total = 0
    targetSignal = dut.data_out._handle
    for index in range(100000):
        await RisingEdge(dut.clk)
        if int(dut.data_out_valid.value) == 1:
            total += dut.data_out.value
        dut.data_in_valid.value = 1
        dut.data_in.value = index
    # NOTE(review): as in testCase0, this waits 100000 cycles after the loop
    # rather than 10.
    await ClockCycles(dut.clk, 100000)


# Article footer (was fused onto the end of this listing):
# "審核編輯:湯梓紅" (reviewed and edited by 湯梓紅)
-
仿真
+關(guān)注
關(guān)注
50文章
4082瀏覽量
133613 -
信號
+關(guān)注
關(guān)注
11文章
2791瀏覽量
76771 -
代碼
+關(guān)注
關(guān)注
30文章
4788瀏覽量
68616 -
Simulation
+關(guān)注
關(guān)注
0文章
13瀏覽量
8170
原文標題:給仿真加點速
文章出處:【微信號:Spinal FPGA,微信公眾號:Spinal FPGA】歡迎添加關注!文章轉載請注明出處。
發布評論請先 登錄
相關(guān)推薦
評論