Skip to content

malloc(): bad size (unsorted) when BTB too large #3188

@muke101

Description

@muke101

Describe the bug
When SimpleBTB is configured too large (beyond 8-way with 4096 entries) together with a larger O3 model, gem5 can occasionally crash with 'malloc(): bad size (unsorted)'.

I run into this when restoring a SimPoint checkpoint for the second SPEC CPU 2017 perlbench workload.

Affects version
Stable

To Reproduce

In BranchPredictor.py:

class SimpleBTB(BranchTargetBuffer):
    numEntries = Param.Unsigned(8192, "Number of BTB entries")
    ...
    associativity = Param.Unsigned(8, "BTB associativity") 
    ...

Alternatively, keep numEntries at 4096 but double the associativity to 16; this triggers the same crash.

BaseO3CPU.py:

from m5.citations import add_citation
from m5.defines import buildEnv
from m5.objects.BaseCPU import BaseCPU

# from m5.objects.O3Checker import O3Checker
from m5.objects.BranchPredictor import *
from m5.objects.FUPool import *
from m5.objects.IndexingPolicies import *
from m5.objects.IQUnit import *
from m5.objects.ReplacementPolicies import *
from m5.objects.SMT import *
from m5.params import *
from m5.proxy import *
from m5.SimObject import *

class BaseO3CPU(BaseCPU):
    # Python-side parameter declarations for the CPU whose C++ class is
    # gem5::o3::CPU. Every attribute below is a Param/VectorParam/
    # OptionalParam declaration of the form (default value, description);
    # the description strings are the authoritative meaning of each knob.
    type = "BaseO3CPU"
    cxx_class = "gem5::o3::CPU"
    cxx_header = "cpu/o3/dyn_inst.hh"

    @classmethod
    def memory_mode(cls):
        """Return the string "timing" as this CPU's memory mode."""
        return "timing"

    @classmethod
    def require_caches(cls):
        """Return True: this CPU reports that it requires caches."""
        return True

    @classmethod
    def support_take_over(cls):
        """Return True: this CPU reports that it supports take-over."""
        return True

    activity = Param.Unsigned(0, "Initial count")

    # NOTE(review): 700 store/load ports looks like an "effectively
    # unlimited" setting rather than a realistic port count -- confirm intent.
    cacheStorePorts = Param.Unsigned(
        700, "Cache Ports. Constrains stores only."
    )
    cacheLoadPorts = Param.Unsigned(700, "Cache Ports. Constrains loads only.")

    # Backward pipeline delays
    fetchToBacDelay = Param.Cycles(1, "Fetch to Branch address calc. delay")
    decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
    renameToFetchDelay = Param.Cycles(1, "Rename to fetch delay")
    iewToFetchDelay = Param.Cycles(1, "Issue/Execute/Writeback to fetch delay")
    commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
    # Fetch stage sizing (width, buffer size in bytes, per-thread queue size).
    fetchWidth = Param.Unsigned(6, "Fetch width")
    fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
    fetchQueueSize = Param.Unsigned(
        128, "Fetch queue size in micro-ops per-thread"
    )

    renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
    iewToDecodeDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to decode delay"
    )
    commitToDecodeDelay = Param.Cycles(1, "Commit to decode delay")

    # Forward pipeline delays
    bacToFetchDelay = Param.Cycles(1, "Branch address calc. to fetch delay")
    fetchToDecodeDelay = Param.Cycles(1, "Fetch to decode delay")
    decodeWidth = Param.Unsigned(6, "Decode width")

    iewToRenameDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to rename delay"
    )
    commitToRenameDelay = Param.Cycles(1, "Commit to rename delay")
    decodeToRenameDelay = Param.Cycles(1, "Decode to rename delay")
    renameWidth = Param.Unsigned(6, "Rename width")

    commitToIEWDelay = Param.Cycles(
        1, "Commit to Issue/Execute/Writeback delay"
    )
    renameToIEWDelay = Param.Cycles(
        2, "Rename to Issue/Execute/Writeback delay"
    )
    issueToExecuteDelay = Param.Cycles(
        1, "Issue to execute delay (internal to the IEW stage)"
    )
    dispatchWidth = Param.Unsigned(6, "Dispatch width")
    issueWidth = Param.Unsigned(12, "Issue width")
    wbWidth = Param.Unsigned(12, "Writeback width")

    iewToCommitDelay = Param.Cycles(
        1, "Issue/Execute/Writeback to commit delay"
    )
    renameToROBDelay = Param.Cycles(1, "Rename to reorder buffer delay")
    commitWidth = Param.Unsigned(6, "Commit width")
    # Declared without a default; per its description, leaving it unset
    # means all instructions are squashed within one cycle.
    squashWidth = OptionalParam.Unsigned(
        "Squash width. If unspecified all instructions are "
        "squashed instantly within one cycle.",
    )
    trapLatency = Param.Cycles(13, "Trap latency")
    fetchTrapLatency = Param.Cycles(1, "Fetch trap latency")

    # Time buffer sizes for communication between stages.
    backComSize = Param.Unsigned(
        12, "Time buffer size for backwards communication"
    )
    forwardComSize = Param.Unsigned(
        12, "Time buffer size for forward communication"
    )

    # Load/store queue sizing and dependency-check configuration.
    LQEntries = Param.Unsigned(192, "Number of load queue entries")
    SQEntries = Param.Unsigned(114, "Number of store queue entries")
    LSQDepCheckShift = Param.Unsigned(
        4, "Number of places to shift addr before check"
    )
    LSQForwardingLatency = Param.Unsigned(
        4, "Latency of store forwarding in cycles"
    )
    LSQCheckLoads = Param.Bool(
        True,
        "Should dependency violations be checked for "
        "loads & stores or just stores",
    )
    # Store-set dependence predictor configuration.
    # NOTE(review): the two clear values below are 128*244 and 128*224;
    # they differ only in 244 vs 224 -- confirm this is intentional and
    # not a typo in one of them.
    store_set_clear_period = Param.Unsigned(
        128*244,
        "Number of load/store insts before the dep predictor "
        "should be invalidated",
    )
    store_set_clear_thres = Param.Unsigned(128*224, "Number of cycles before store sets should be invalided (XiangShan variant)")
    LFSTSize = Param.Unsigned(4096, "Last fetched store table size")
    LFSTEntrySize = Param.Unsigned(4,"The number of store table inst in every entry of LFST can contain (XiangShan variant)")
    SSITSize = Param.MemorySize("8192", "Store set ID table size")
    SSITAssoc = Param.Unsigned(1, "SSIT table associativity")
    SSITReplPolicy = Param.BaseReplacementPolicy(
        LRURP(), "SSIT replacement policy"
    )
    # The default indexing policy is SetAssociative, with size given as
    # SSITSize * 4, assoc taken from SSITAssoc, and entry_size=4. Both
    # SSIT values are referenced through the Parent proxy.
    SSITIndexingPolicy = Param.BaseIndexingPolicy(
        SetAssociative(
            size=Parent.SSITSize * 4,
            assoc=Parent.SSITAssoc,
            entry_size=4,
        ),
        "SSIT indexing policy",
    )

    # phast_* parameters: table geometry (rows, entries per row), tag
    # width, and the confidence counter maximum.
    phast_num_rows = Param.Unsigned(128, "Number of rows per table")
    phast_associativity = Param.Unsigned(4, "Number of entries per row")
    phast_tag_bits = Param.Unsigned(16, "Size of entry tags")
    phast_max_counter = Param.Unsigned(16, "Max confidence counter value")

    # MDP lookup limit per cycle and whether stores also query the MDP.
    mdpReadPorts = Param.Unsigned(4, "Max MDP lookups per cycle")
    storesDoLookups = Param.Bool(False, "Whether stores query the MDP")

    numRobs = Param.Unsigned(1, "Number of Reorder Buffers")

    # Physical register file sizes, one parameter per register class.
    numPhysIntRegs = Param.Unsigned(
        512, "Number of physical integer registers"
    )
    numPhysFloatRegs = Param.Unsigned(
        512, "Number of physical floating point registers"
    )
    numPhysVecRegs = Param.Unsigned(512, "Number of physical vector registers")
    numPhysVecPredRegs = Param.Unsigned(
        64, "Number of physical predicate registers"
    )
    numPhysMatRegs = Param.Unsigned(8, "Number of physical matrix registers")
    # most ISAs don't use condition-code regs, so default is 0
    numPhysCCRegs = Param.Unsigned(0, "Number of physical cc registers")
    numROBEntries = Param.Unsigned(512, "Number of reorder buffer entries")
    # Vector parameter; the default is a single default-constructed IQUnit.
    instQueues = VectorParam.IQUnit(IQUnit(), "Vector of IQs")

    # SMT fetch/LSQ/IQ/ROB/commit policies and their sharing thresholds.
    smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching Threads")
    smtFetchPolicy = Param.SMTFetchPolicy("RoundRobin", "SMT Fetch policy")
    smtLSQPolicy = Param.SMTQueuePolicy(
        "Partitioned", "SMT LSQ Sharing Policy"
    )
    smtLSQThreshold = Param.Int(100, "SMT LSQ Threshold Sharing Parameter")
    smtIQPolicy = Param.SMTQueuePolicy("Partitioned", "SMT IQ Sharing Policy")
    smtIQThreshold = Param.Int(100, "SMT IQ Threshold Sharing Parameter")
    smtROBPolicy = Param.SMTQueuePolicy(
        "Partitioned", "SMT ROB Sharing Policy"
    )
    smtROBThreshold = Param.Int(100, "SMT ROB Threshold Sharing Parameter")
    smtCommitPolicy = Param.CommitPolicy("RoundRobin", "SMT Commit Policy")

    # Default branch predictor: TournamentBP for conditional branches and
    # a SimpleBTB constructed without arguments (so it uses whatever
    # defaults the SimpleBTB class declares); numThreads is taken from the
    # parent through the Parent proxy.
    branchPred = Param.BranchPredictor(
        BranchPredictor(conditionalBranchPred=TournamentBP(),
                        btb=SimpleBTB(),
                        numThreads=Parent.numThreads,
        ),
        "Branch Predictor",
    )

    needsTSO = Param.Bool(False, "Enable TSO Memory model")

    # Receive-response throttling in the LSQ (disabled by default).
    recvRespThrottling = Param.Bool(
        False, "Enable load receive response throttling in the LSQ"
    )
    recvRespMaxCachelines = Param.Unsigned(
        1,
        "Maximum number of different receive response cachelines per cycle",
    )
    recvRespBufferSize = Param.Unsigned(
        64, "Maximum number of receive response bytes per cycle"
    )

    ## Parameters for decoupled front-end
    # decoupledFrontEnd defaults to False; numFTQEntries is described as
    # only being used when the decoupled front-end is enabled.
    decoupledFrontEnd = Param.Bool(False, "Enables the decoupled front-end")
    numFTQEntries = Param.Unsigned(
        8,
        "Number of entries in the Fetch target queue. (only used for "
        "decoupled front-end)",
    )
    minInstSize = Param.Unsigned(
        1,
        "Minimum instruction size (bytes). Determines the granularity "
        "of the instruction minimum search width per cycle",
    )
    fetchTargetWidth = Param.Unsigned(
        64,
        "Max width (bytes) of Fetch target. "
        "Determines the maximum search width per cycle",
    )
    maxFTPerCycle = Param.Unsigned(4, "Max number of FT created per cycle")
    maxTakenPredPerCycle = Param.Unsigned(
        1, "Max number of taken predictions per cycle"
    )


# Pass BaseO3CPU and the BibTeX entry below (the MICRO '23 "Ignite" paper)
# to m5.citations.add_citation.
add_citation(
    BaseO3CPU,
    """@inproceedings{10.1145/3613424.3614258,
  author    = {Schall, David and
               Sandberg, Andreas and
               Grot, Boris},
  title     = {Warming Up a Cold Front-End with Ignite},
  year      = {2023},
  publisher = {Association for Computing Machinery},
  address   = {Toronto, ON, Canada},
  doi       = {10.1145/3613424.3614258},
  booktitle = {Proceedings of the 56th Annual IEEE/ACM International Symposium on Microarchitecture (MICRO '23)},
  series    = {MICRO'23}
}
""",
)

Simulation command:
/path/to/gem5//build/ARM/gem5.fast /path/to/gem5/configs/deprecated/example/se.py --cpu-type=DerivO3CPU --caches --l2cache --mem-type=DDR4_2400_8x8 --restore-simpoint-checkpoint -r 1 --checkpoint-dir /path/to/checkpoints --restore-with-cpu=AtomicSimpleCPU --mem-size=16GiB -c ./perlbench_s_peak.mytest-64 '--options=-I./lib diffmail.pl 4 800 10 17 19 300' --l1d_size=64KiB --l1i_size=32KiB --l2_size=1MB --l1d_assoc=8 --l1i_assoc=8 --l2_assoc=16 --l1i-hwp-type=StridePrefetcher --l1d-hwp-type=StridePrefetcher --l2-hwp-type=StridePrefetcher -P 'system.switch_cpus[:].LSQDepCheckShift=0'

Gem5 compiled with:
scons build/ARM/gem5.fast --ignore-style --linker=gold

Host Operating System
Ubuntu 24.04

Host ISA
X86

Compiler used
GCC 13

Additional information

I've attached the binary the checkpoint was generated from (I appended .txt to make it a supported file type). The simpoint checkpoint starts at instruction 100000000 (90000000 with warmup).
perlbench_s_peak.mytest-64.txt

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug, cpu-o3 (gem5's Out-Of-Order CPU)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions