I run into this when restoring a simpoint checkpoint for the second Spec2017 perlbench workload.
Likewise, one could keep numEntries at 4096 but double the associativity to 16.
from m5.citations import add_citation
from m5.defines import buildEnv
from m5.objects.BaseCPU import BaseCPU
# from m5.objects.O3Checker import O3Checker
from m5.objects.BranchPredictor import *
from m5.objects.FUPool import *
from m5.objects.IndexingPolicies import *
from m5.objects.IQUnit import *
from m5.objects.ReplacementPolicies import *
from m5.objects.SMT import *
from m5.params import *
from m5.proxy import *
from m5.SimObject import *
class BaseO3CPU(BaseCPU):
    """Parameter declarations for the O3 CPU SimObject.

    The SimObject type is ``BaseO3CPU`` and it is bound to the C++ class
    ``gem5::o3::CPU``.  Every attribute below declares one configurable
    parameter: the first argument is the default value (when there is
    one) and the final string is the parameter's description.
    """

    type = "BaseO3CPU"
    cxx_class = "gem5::o3::CPU"
    cxx_header = "cpu/o3/dyn_inst.hh"

    @classmethod
    def memory_mode(cls):
        """Return the memory mode string for this CPU type ("timing")."""
        return "timing"

    @classmethod
    def require_caches(cls):
        """Return True: this CPU type reports that it requires caches."""
        return True

    @classmethod
    def support_take_over(cls):
        """Return True: this CPU type reports that it supports take-over."""
        return True

    activity = Param.Unsigned(0, "Initial count")

    # Cache port limits, tracked separately for stores and loads.
    cacheStorePorts = Param.Unsigned(
        700, "Cache Ports. Constrains stores only."
    )
    cacheLoadPorts = Param.Unsigned(700, "Cache Ports. Constrains loads only.")

    # Backward pipeline delays
    fetchToBacDelay = Param.Cycles(1, "Fetch to Branch address calc. delay")
    decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
    renameToFetchDelay = Param.Cycles(1, "Rename to fetch delay")
    iewToFetchDelay = Param.Cycles(1, "Issue/Execute/Writeback to fetch delay")
    commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
    fetchWidth = Param.Unsigned(6, "Fetch width")
    fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
    fetchQueueSize = Param.Unsigned(
        128, "Fetch queue size in micro-ops per-thread"
    )

    renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
    iewToDecodeDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to decode delay"
    )
    commitToDecodeDelay = Param.Cycles(1, "Commit to decode delay")

    # Forward pipeline delays
    bacToFetchDelay = Param.Cycles(1, "Branch address calc. to fetch delay")
    fetchToDecodeDelay = Param.Cycles(1, "Fetch to decode delay")
    decodeWidth = Param.Unsigned(6, "Decode width")

    iewToRenameDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to rename delay"
    )
    commitToRenameDelay = Param.Cycles(1, "Commit to rename delay")
    decodeToRenameDelay = Param.Cycles(1, "Decode to rename delay")
    renameWidth = Param.Unsigned(6, "Rename width")

    commitToIEWDelay = Param.Cycles(
        1, "Commit to Issue/Execute/Writeback delay"
    )
    renameToIEWDelay = Param.Cycles(
        2, "Rename to Issue/Execute/Writeback delay"
    )
    issueToExecuteDelay = Param.Cycles(
        1, "Issue to execute delay (internal to the IEW stage)"
    )
    dispatchWidth = Param.Unsigned(6, "Dispatch width")
    issueWidth = Param.Unsigned(12, "Issue width")
    wbWidth = Param.Unsigned(12, "Writeback width")

    iewToCommitDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to commit delay"
    )
    renameToROBDelay = Param.Cycles(1, "Rename to reorder buffer delay")
    commitWidth = Param.Unsigned(6, "Commit width")
    # Declared without a default value; the description states what
    # happens when it is left unspecified.
    squashWidth = OptionalParam.Unsigned(
        "Squash width. If unspecified all instructions are "
        "squashed instantly within one cycle.",
    )
    trapLatency = Param.Cycles(13, "Trap latency")
    fetchTrapLatency = Param.Cycles(1, "Fetch trap latency")

    # Time buffer sizes for inter-stage communication.
    backComSize = Param.Unsigned(
        12, "Time buffer size for backwards communication"
    )
    forwardComSize = Param.Unsigned(
        12, "Time buffer size for forward communication"
    )

    # Load/store queue sizing and dependency checking.
    LQEntries = Param.Unsigned(192, "Number of load queue entries")
    SQEntries = Param.Unsigned(114, "Number of store queue entries")
    LSQDepCheckShift = Param.Unsigned(
        4, "Number of places to shift addr before check"
    )
    LSQForwardingLatency = Param.Unsigned(
        4, "Latency of store forwarding in cycles"
    )
    LSQCheckLoads = Param.Bool(
        True,
        "Should dependency violations be checked for "
        "loads & stores or just stores",
    )

    # Store-set dependence predictor parameters.
    # NOTE(review): the period uses 128 * 244 while the threshold below
    # uses 128 * 224 -- confirm that the difference is intentional.
    store_set_clear_period = Param.Unsigned(
        128 * 244,
        "Number of load/store insts before the dep predictor "
        "should be invalidated",
    )
    store_set_clear_thres = Param.Unsigned(
        128 * 224,
        "Number of cycles before store sets should be invalidated "
        "(XiangShan variant)",
    )
    LFSTSize = Param.Unsigned(4096, "Last fetched store table size")
    LFSTEntrySize = Param.Unsigned(
        4,
        "Number of store insts each entry of the LFST can contain "
        "(XiangShan variant)",
    )
    SSITSize = Param.MemorySize("8192", "Store set ID table size")
    SSITAssoc = Param.Unsigned(1, "SSIT table associativity")
    SSITReplPolicy = Param.BaseReplacementPolicy(
        LRURP(), "SSIT replacement policy"
    )
    # The indexing policy is sized as SSITSize * 4 with entry_size=4;
    # presumably so that it covers SSITSize entries -- TODO confirm.
    SSITIndexingPolicy = Param.BaseIndexingPolicy(
        SetAssociative(
            size=Parent.SSITSize * 4,
            assoc=Parent.SSITAssoc,
            entry_size=4,
        ),
        "SSIT indexing policy",
    )

    # phast_* parameters: table rows, entries per row, tag width and
    # confidence counter limit.
    phast_num_rows = Param.Unsigned(128, "Number of rows per table")
    phast_associativity = Param.Unsigned(4, "Number of entries per row")
    phast_tag_bits = Param.Unsigned(16, "Size of entry tags")
    phast_max_counter = Param.Unsigned(16, "Max confidence counter value")

    # MDP lookup limits.
    mdpReadPorts = Param.Unsigned(4, "Max MDP lookups per cycle")
    storesDoLookups = Param.Bool(False, "Whether stores query the MDP")

    numRobs = Param.Unsigned(1, "Number of Reorder Buffers")

    # Physical register file sizes.
    numPhysIntRegs = Param.Unsigned(
        512, "Number of physical integer registers"
    )
    numPhysFloatRegs = Param.Unsigned(
        512, "Number of physical floating point registers"
    )
    numPhysVecRegs = Param.Unsigned(512, "Number of physical vector registers")
    numPhysVecPredRegs = Param.Unsigned(
        64, "Number of physical predicate registers"
    )
    numPhysMatRegs = Param.Unsigned(8, "Number of physical matrix registers")
    # most ISAs don't use condition-code regs, so default is 0
    numPhysCCRegs = Param.Unsigned(0, "Number of physical cc registers")

    numROBEntries = Param.Unsigned(512, "Number of reorder buffer entries")

    # Defaults to a vector holding a single default-constructed IQUnit.
    instQueues = VectorParam.IQUnit(IQUnit(), "Vector of IQs")

    # SMT fetch, queue-sharing and commit policies.
    smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching Threads")
    smtFetchPolicy = Param.SMTFetchPolicy("RoundRobin", "SMT Fetch policy")
    smtLSQPolicy = Param.SMTQueuePolicy(
        "Partitioned", "SMT LSQ Sharing Policy"
    )
    smtLSQThreshold = Param.Int(100, "SMT LSQ Threshold Sharing Parameter")
    smtIQPolicy = Param.SMTQueuePolicy("Partitioned", "SMT IQ Sharing Policy")
    smtIQThreshold = Param.Int(100, "SMT IQ Threshold Sharing Parameter")
    smtROBPolicy = Param.SMTQueuePolicy(
        "Partitioned", "SMT ROB Sharing Policy"
    )
    smtROBThreshold = Param.Int(100, "SMT ROB Threshold Sharing Parameter")
    smtCommitPolicy = Param.CommitPolicy("RoundRobin", "SMT Commit Policy")

    # Default predictor: a TournamentBP conditional predictor with a
    # SimpleBTB, taking its thread count from the parent's numThreads.
    branchPred = Param.BranchPredictor(
        BranchPredictor(
            conditionalBranchPred=TournamentBP(),
            btb=SimpleBTB(),
            numThreads=Parent.numThreads,
        ),
        "Branch Predictor",
    )
    needsTSO = Param.Bool(False, "Enable TSO Memory model")

    # Load receive-response throttling in the LSQ.
    recvRespThrottling = Param.Bool(
        False, "Enable load receive response throttling in the LSQ"
    )
    recvRespMaxCachelines = Param.Unsigned(
        1,
        "Maximum number of different receive response cachelines per cycle",
    )
    recvRespBufferSize = Param.Unsigned(
        64, "Maximum number of receive response bytes per cycle"
    )

    # Parameters for decoupled front-end
    decoupledFrontEnd = Param.Bool(False, "Enables the decoupled front-end")
    numFTQEntries = Param.Unsigned(
        8,
        "Number of entries in the Fetch target queue. (only used for "
        "decoupled front-end)",
    )
    minInstSize = Param.Unsigned(
        1,
        "Minimum instruction size (bytes). Determines the granularity "
        "of the instruction minimum search width per cycle",
    )
    fetchTargetWidth = Param.Unsigned(
        64,
        "Max width (bytes) of Fetch target. "
        "Determines the maximum search width per cycle",
    )
    maxFTPerCycle = Param.Unsigned(4, "Max number of FT created per cycle")
    maxTakenPredPerCycle = Param.Unsigned(
        1, "Max number of taken predictions per cycle"
    )
# Pass BaseO3CPU and the BibTeX entry below (the MICRO '23 "Ignite" paper)
# to m5.citations.add_citation.
add_citation(
BaseO3CPU,
"""@inproceedings{10.1145/3613424.3614258,
author = {Schall, David and
Sandberg, Andreas and
Grot, Boris},
title = {Warming Up a Cold Front-End with Ignite},
year = {2023},
publisher = {Association for Computing Machinery},
address = {Toronto, ON, Canada},
doi = {10.1145/3613424.3614258},
booktitle = {Proceedings of the 56th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO '23)},
series = {MICRO'23}
}
""",
)
I've attached the binary the checkpoint was generated from (I appended .txt to make it a supported file type). The simpoint checkpoint starts at instruction 100000000 (90000000 with warmup).
perlbench_s_peak.mytest-64.txt
Describe the bug
When SimpleBTB is configured too large (more than 8-way with 4096 entries) together with a larger O3 model, gem5 can occasionally crash with 'malloc(): bad size (unsorted)'.
I run into this when restoring a simpoint checkpoint for the second Spec2017 perlbench workload.
Affects version
Stable
To Reproduce
In BranchPredictor.py:
Likewise, one could keep numEntries at 4096 but double the associativity to 16.
BaseO3CPU.py:
Simulation command:
/path/to/gem5//build/ARM/gem5.fast /path/to/gem5/configs/deprecated/example/se.py --cpu-type=DerivO3CPU --caches --l2cache --mem-type=DDR4_2400_8x8 --restore-simpoint-checkpoint -r 1 --checkpoint-dir /path/to/checkpoints --restore-with-cpu=AtomicSimpleCPU --mem-size=16GiB -c ./perlbench_s_peak.mytest-64 '--options=-I./lib diffmail.pl 4 800 10 17 19 300' --l1d_size=64KiB --l1i_size=32KiB --l2_size=1MB --l1d_assoc=8 --l1i_assoc=8 --l2_assoc=16 --l1i-hwp-type=StridePrefetcher --l1d-hwp-type=StridePrefetcher --l2-hwp-type=StridePrefetcher -P 'system.switch_cpus[:].LSQDepCheckShift=0'
Gem5 compiled with:
scons build/ARM/gem5.fast --ignore-style --linker=gold
Host Operating System
Ubuntu 24.04
Host ISA
X86
Compiler used
GCC 13
Additional information
I've attached the binary the checkpoint was generated from (I appended .txt to make it a supported file type). The simpoint checkpoint starts at instruction 100000000 (90000000 with warmup).
perlbench_s_peak.mytest-64.txt