Hi,
I was wondering if there is a way to quantify the core to core
communication latency with gem5? If so, can anyone please provide some
guidelines to extract that information from simulation results?
Thanks,
Kazi
Hello Kazi,
If by core-to-core communication latency you are referring to the latency
imposed by read-sharing a cache block, you can use TrafficGenerator from
the gem5 stdlib. The example below is most probably not the best way to do
this, but I could successfully measure the latency of moving the cache
block with address "addr" from core "src" to core "dst". After simulation
you should search for "generator.avgReadLatency" in the stats output. Note:
This script includes specific details about my use case (e.g. you should
replace CoherentMeshNetwork with your own cache hierarchy).
Best,
import argparse
import m5
from m5.debug import flags
from m5.objects import Root, DDR4_2400_8x8
from gem5.components.boards.test_board import TestBoard
from gem5.components.memory.memory import ChanneledMemory
from gem5.components.processors.traffic_generator import TrafficGenerator
from components.cmn import CoherentMeshNetwork
def get_inputs():
    """Parse and validate the script's positional command-line arguments.

    Expected arguments, in order: ``num_cores``, ``addr``, ``src_id``,
    ``dst_id`` (all integers).

    Returns:
        A ``(num_cores, addr, src_id, dst_id)`` tuple.

    Exits through ``parser.error`` (``SystemExit`` with status 2 and a usage
    message) when either core id is outside ``[0, num_cores)`` or when the
    source and destination ids are the same.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "num_cores", type=int, help="Number of generator cores to simulate."
    )
    parser.add_argument(
        "addr",
        type=int,
        help="Address to move",
    )
    parser.add_argument("src_id", type=int, help="Number of source core.")
    parser.add_argument("dst_id", type=int, help="Number of destination core.")
    args = parser.parse_args()

    # Report bad input through argparse rather than `assert`: assertions are
    # removed when Python runs with -O, and parser.error prints the usage
    # text instead of a bare traceback.
    for arg_name in ("src_id", "dst_id"):
        core_id = getattr(args, arg_name)
        # The lower bound matters as well: a negative id never matches any
        # generator index in range(num_cores).
        if not 0 <= core_id < args.num_cores:
            parser.error(
                f"{arg_name} must be in [0, {args.num_cores}), got {core_id}."
            )
    if args.src_id == args.dst_id:
        parser.error("src_id and dst_id must refer to different cores.")

    return args.num_cores, args.addr, args.src_id, args.dst_id
def generate_config_files(num_cores, addr, src_id, dst_id, out_dir="/tmp"):
    """Write one traffic-generator config file per core and return the paths.

    Every file starts with the same ``STATE 0`` line (a LINEAR entry over the
    core's own ``[i * 2048, (i + 1) * 2048]`` address window), followed by
    role-specific ``STATE`` lines, an ``INIT 0`` line and a chain of
    ``TRANSITION`` lines that walks the states in order and then self-loops
    on the last one.

    Roles:
      * ``i == src_id``: gets a LINEAR state whose start and end are both
        ``addr``, then EXIT, IDLE and EXIT states.
      * ``i == dst_id``: gets IDLE and EXIT states first, then the LINEAR
        state on ``addr``, then IDLE, EXIT and IDLE states.
      * any other core: gets only an IDLE state and an EXIT state.

    NOTE(review): the meaning of the numeric fields in each STATE line is
    defined by gem5's traffic-generator config format and is not visible in
    this file -- confirm against the gem5 documentation before changing them.

    Args:
        num_cores: Number of config files to generate (indices 0..num_cores-1).
        addr: Address placed in the source/destination LINEAR states.
        src_id: Index of the core that gets the "source" state machine.
        dst_id: Index of the core that gets the "destination" state machine.
        out_dir: Directory the files are written to. Defaults to ``/tmp``,
            which reproduces the previously hard-coded location; pass a
            private directory to keep concurrent runs from overwriting each
            other's files.

    Returns:
        List of the written file paths, ordered by core index.
    """
    # Local import: only this function needs os, and the file's import block
    # does not provide it.
    import os

    def _transition_chain(last_state):
        # 0 -> 1 -> ... -> last_state, then last_state loops onto itself.
        chain = [
            f"TRANSITION {state} {state + 1} 1\n"
            for state in range(last_state)
        ]
        chain.append(f"TRANSITION {last_state} {last_state} 1\n")
        return chain

    move_state = f"LINEAR 100 {addr} {addr} 64 500 500 64\n"

    paths = []
    for i in range(num_cores):
        lines = [
            f"STATE 0 0 LINEAR 100 {i*2048} {(i+1)*2048} 64 500 500 2048\n",
        ]
        # The source check comes first, so if src_id == dst_id that core is
        # treated as the source (same precedence as the original if/elif).
        if i == src_id:
            lines += [
                f"STATE 1 0 {move_state}",
                "STATE 2 0 EXIT\n",
                "STATE 3 10000000 IDLE\n",
                "STATE 4 0 EXIT\n",
            ]
            last_state = 4
        elif i == dst_id:
            lines += [
                "STATE 1 500000 IDLE\n",
                "STATE 2 0 EXIT\n",
                f"STATE 3 0 {move_state}",
                "STATE 4 500000 IDLE\n",
                "STATE 5 0 EXIT\n",
                "STATE 6 10000000 IDLE\n",
            ]
            last_state = 6
        else:
            lines += ["STATE 1 10000000 IDLE\n", "STATE 2 0 EXIT\n"]
            last_state = 2

        path = os.path.join(out_dir, f"core_{i}.cfg")
        with open(path, "w") as config:
            config.writelines(
                lines + ["INIT 0\n"] + _transition_chain(last_state)
            )
        paths.append(path)
    return paths
def MultiChannelDDR4(num_channels):
    """Build a 16GiB ``ChanneledMemory`` of DDR4_2400_8x8 interfaces.

    Args:
        num_channels: Forwarded as the second positional argument of
            ``ChanneledMemory`` (the channel count).

    Returns:
        The constructed ``ChanneledMemory`` instance.
    """
    # The third positional argument (128) is passed through unchanged;
    # presumably the interleaving size -- confirm against ChanneledMemory.
    memory = ChanneledMemory(DDR4_2400_8x8, num_channels, 128, size="16GiB")
    return memory
if __name__ == "__m5_main__":
    # Assemble the system: one traffic generator driven by the per-core
    # config files, the cache hierarchy under test, and 8 channels of DDR4.
    num_cores, addr, src_id, dst_id = get_inputs()
    config_paths = generate_config_files(num_cores, addr, src_id, dst_id)
    generator = TrafficGenerator(config_paths)
    cache = CoherentMeshNetwork()
    memory = MultiChannelDDR4(8)
    board = TestBoard(
        clk_freq="4GHz",
        generator=generator,
        cache_hierarchy=cache,
        memory=memory,
    )

    root = Root(full_system=False, system=board)
    board._pre_instantiate()
    m5.instantiate()

    generator.start_traffic()
    print("Beginning simulation!")

    # The run is driven by exactly three exit events. Stats are reset on the
    # second one, so the reported statistics only cover what happens between
    # the second and third events.
    for exit_count in range(1, 4):
        exit_event = m5.simulate()
        print(
            f"Exiting @ tick {m5.curTick()} because {exit_event.getCause()}."
        )
        print(f"Received {exit_count} exit events.")
        if exit_count == 1:
            print("Source core done reading.")
        elif exit_count == 2:
            print("Resetting stats.")
            m5.stats.reset()
        else:
            # Third event: the measurement window is over.
            print("Exiting while loop.")

    print("Simulation over.")