diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py
index 4a8c79880f..7cb14af14b 100644
--- a/configs/common/CacheConfig.py
+++ b/configs/common/CacheConfig.py
@@ -170,17 +170,14 @@ def config_cache(options, system):
         system.tol3bus = L2XBar(clk_domain=system.cpu_clk_domain, width=256)
         system.l3.cpu_side = system.tol3bus.mem_side_ports
         system.l3.mem_side = system.membus.cpu_side_ports
-        system.l3.max_cache_level = 3

     for i in range(options.num_cpus):
         if options.l3cache:
             # l2 -> tol3bus -> l3
             system.l2_caches[i].mem_side = system.tol3bus.cpu_side_ports
             # l3 -> membus
-            system.l2_caches[i].max_cache_level = 3
         else:
             system.l2_caches[i].mem_side = system.membus.cpu_side_ports
-            system.l2_caches[i].max_cache_level = 2

     if options.memchecker:
         system.memchecker = MemChecker()
@@ -189,14 +186,7 @@ def config_cache(options, system):
         if options.caches:
             icache = icache_class(**_get_cache_opts('l1i', options))
             dcache = dcache_class(**_get_cache_opts('l1d', options))
-            if options.l2cache:
-                icache.max_cache_level = 2
-                dcache.max_cache_level = 2
-            if options.l3cache:
-                icache.max_cache_level = 3
-                dcache.max_cache_level = 3
             if dcache.prefetcher != NULL:
-                print("Add dtb for L1D prefetcher")
                 dcache.prefetcher.registerTLB(system.cpu[i].mmu.dtb)
                 if options.l1d_hwp_type == 'XSCompositePrefetcher':
                     if options.l1d_enable_spp:
@@ -233,7 +223,6 @@ def config_cache(options, system):
                 dcache.prefetcher.add_pf_downstream(system.l2_caches[i].prefetcher)
                 system.l2_caches[i].prefetcher.queue_size = 64
                 system.l2_caches[i].prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L2 prefetcher {} as downstream of L1D prefetcher {}".format(i, i))

             if options.l3cache and options.l2_to_l3_pf_hint:
                 assert system.l2_caches[i].prefetcher != NULL and \
@@ -241,7 +230,6 @@ def config_cache(options, system):
                 system.l2_caches[i].prefetcher.add_pf_downstream(system.l3.prefetcher)
                 system.l3.prefetcher.queue_size = 64
                 system.l3.prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L3 prefetcher as downstream of L2 prefetcher {}".format(i))

     # If we have a walker cache specified, instantiate two
     # instances here
diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index a2cb1ba20b..145ad665cd 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -89,8 +89,6 @@ class L2Cache(Cache):
     mshrs = 64
     tgts_per_mshr = 20
     clusivity='mostly_incl'
-    prefetch_on_access = True
-    #prefetch_on_access = False

     # always writeback clean when lower level is exclusive
     writeback_clean = True
@@ -110,7 +108,6 @@ class L3Cache(Cache):
     tgts_per_mshr = 20
     clusivity='mostly_excl'
     writeback_clean = False
-    prefetch_on_access = True

     # aligned latency:
     tag_latency = 2
diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py
new file mode 100644
index 0000000000..5e46a4518d
--- /dev/null
+++ b/configs/common/PrefetcherConfig.py
@@ -0,0 +1,53 @@
+import m5
+from m5.objects import *
+from common.Caches import *
+from common import ObjectList
+
+
+def _get_hwp(hwp_option):
+    if hwp_option == None:
+        return NULL
+
+    hwpClass = ObjectList.hwp_list.get(hwp_option)
+    return hwpClass()
+
+def create_prefetcher(cpu, cache_level, options):
+    prefetcher_attr = '{}_hwp_type'.format(cache_level)
+    prefetcher_name = ''
+    prefetcher = NULL
+    if hasattr(options, prefetcher_attr):
+        prefetcher_name = getattr(options, prefetcher_attr)
+        prefetcher = _get_hwp(prefetcher_name)
+        print(f"create_prefetcher at {cache_level}: {prefetcher_name}")
+
+    if prefetcher == NULL:
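+        # no prefetcher configured for this cache level, so callers skip all wiring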
+        return NULL
+
+    if cpu != NULL:
+        prefetcher.registerTLB(cpu.mmu.dtb)
+
+    prefetcher.queue_size = 64
+
+    if prefetcher_name == 'XSCompositePrefetcher':
+        if options.l1d_enable_spp:
+            prefetcher.enable_spp = True
+        if options.l1d_enable_cplx:
+            prefetcher.enable_cplx = True
+        prefetcher.pht_pf_level = options.pht_pf_level
+        prefetcher.short_stride_thres = options.short_stride_thres
+        prefetcher.fuzzy_stride_matching = False
+        prefetcher.stream_pf_ahead = True
+        prefetcher.bop_large.delay_queue_enable = True
+        prefetcher.bop_large.bad_score = 10
+        prefetcher.bop_small.delay_queue_enable = True
+        prefetcher.bop_small.bad_score = 5
+        prefetcher.queue_size = 128
+        prefetcher.max_prefetch_requests_with_pending_translation = 128
+        prefetcher.region_size = 64*16  # 64B * blocks per region
+
+        prefetcher.berti.use_byte_addr = True
+        prefetcher.berti.aggressive_pf = False
+        prefetcher.berti.trigger_pht = True
+
+    return prefetcher
\ No newline at end of file
diff --git a/configs/common/cores/arm/O3_ARM_v7a.py b/configs/common/cores/arm/O3_ARM_v7a.py
index d032a1aa88..5bef6251dc 100644
--- a/configs/common/cores/arm/O3_ARM_v7a.py
+++ b/configs/common/cores/arm/O3_ARM_v7a.py
@@ -179,7 +179,6 @@ class O3_ARM_v7aL2(Cache):
     size = '1MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)
diff --git a/configs/common/cores/arm/ex5_LITTLE.py b/configs/common/cores/arm/ex5_LITTLE.py
index 57f6a6b812..ba2504c74e 100644
--- a/configs/common/cores/arm/ex5_LITTLE.py
+++ b/configs/common/cores/arm/ex5_LITTLE.py
@@ -122,7 +122,6 @@ class L2(Cache):
     size = '512kB'
     assoc = 8
     write_buffers = 16
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=1, latency = 1)
diff --git a/configs/common/cores/arm/ex5_big.py b/configs/common/cores/arm/ex5_big.py
index de7a45063a..c5cad4957a 100644
--- a/configs/common/cores/arm/ex5_big.py
+++ b/configs/common/cores/arm/ex5_big.py
@@ -174,7 +174,6 @@ class L2(Cache):
     size = '2MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)
diff --git a/configs/ruby/CHI.py b/configs/ruby/CHI.py
index a9e8eeb7e6..683e6abd9e 100644
--- a/configs/ruby/CHI.py
+++ b/configs/ruby/CHI.py
@@ -111,25 +111,29 @@ def create_system(
     # dataAccessLatency may be set to 0 if one wants to consider parallel
     # data and tag lookups
     class L1ICache(RubyCache):
+        level = 1
         dataAccessLatency = 1
         tagAccessLatency = 1
         size = options.l1i_size
        assoc = options.l1i_assoc

     class L1DCache(RubyCache):
+        level = 1
         dataAccessLatency = 0
         tagAccessLatency = 1
         size = options.l1d_size
         assoc = options.l1d_assoc

     class L2Cache(RubyCache):
-        dataAccessLatency = 6
+        level = 2
+        dataAccessLatency = 13
         tagAccessLatency = 2
         size = options.l2_size
         assoc = options.l2_assoc

     class HNFCache(RubyCache):
-        dataAccessLatency = 10
+        level = 3
+        dataAccessLatency = 17
         tagAccessLatency = 2
         size = options.l3_size
         assoc = options.l3_assoc
@@ -154,11 +158,13 @@ class HNFCache(RubyCache):
             L1ICache,
             L1DCache,
             system.cache_line_size.value,
+            options
         )
         for cpu in cpus
     ]
+
     for rnf in ruby_system.rnf:
-        rnf.addPrivL2Cache(L2Cache)
+        rnf.addPrivL2Cache(L2Cache, options)
         cpu_sequencers.extend(rnf.getSequencers())
         all_cntrls.extend(rnf.getAllControllers())
         network_nodes.append(rnf)
@@ -191,10 +197,17 @@ class HNFCache(RubyCache):
     hnf_list = [i for i in range(options.num_l3caches)]
     CHI_HNF.createAddrRanges(sysranges, system.cache_line_size.value, hnf_list)
     ruby_system.hnf = [
-        CHI_HNF(i, ruby_system, HNFCache, None)
+        CHI_HNF(i, ruby_system, HNFCache, options, None)
         for i in range(options.num_l3caches)
     ]

+    if options.l2_to_l3_pf_hint:
+        if len(ruby_system.hnf) > 1:
+            print("Warning: L2 to L3 prefetch hint is not supported with multiple HNFs")
+        else:
+            for rnf in ruby_system.rnf:
+                rnf.addLLCPrefetcherDownstream(ruby_system.hnf[0].getPrefetcher())
+
     for hnf in ruby_system.hnf:
         network_nodes.append(hnf)
         network_cntrls.extend(hnf.getNetworkSideControllers())
diff --git a/configs/ruby/CHI_config.py b/configs/ruby/CHI_config.py
index b04d144207..9df852f83c 100644
--- a/configs/ruby/CHI_config.py
+++ b/configs/ruby/CHI_config.py
@@ -49,7 +49,7 @@

 import m5
 from m5.objects import *
-
+from common.PrefetcherConfig import create_prefetcher

 class Versions:
     """
@@ -86,9 +86,9 @@ class NoC_Params:
     router_link_latency = 1
     node_link_latency = 1
     router_latency = 1
-    router_buffer_size = 4
-    cntrl_msg_size = 8
-    data_width = 32
+    router_buffer_size = 128
+    cntrl_msg_size = 0
+    data_width = 64
     cross_links = []
     cross_link_latency = 0
@@ -222,6 +222,10 @@ def __init__(self, ruby_system):
         # timeouts on unique lines when a store conditional fails
         self.sc_lock_enabled = False

+    def setPrefetcher(self, pf):
+        self.prefetcher = pf
+        self.use_prefetcher = pf != NULL
+

 class CHI_L1Controller(CHI_Cache_Controller):
     """
@@ -249,11 +253,11 @@ def __init__(self, ruby_system, sequencer, cache, prefetcher, is_dcache=False):
         self.alloc_on_atomic = False
         self.dealloc_on_unique = False
         self.dealloc_on_shared = False
-        self.dealloc_backinv_unique = True
-        self.dealloc_backinv_shared = True
+        self.dealloc_backinv_unique = False
+        self.dealloc_backinv_shared = False
         self.is_dcache = is_dcache
         # Some reasonable default TBE params
-        self.number_of_TBEs = 16
+        self.number_of_TBEs = 32+8
         self.number_of_repl_TBEs = 16
         self.number_of_snoop_TBEs = 4
         self.number_of_DVM_TBEs = 16
@@ -261,6 +265,9 @@ def __init__(self, ruby_system, sequencer, cache, prefetcher, is_dcache=False):
         self.unify_repl_TBEs = False

+        self.response_latency = 4
+        self.request_latency = 1
+

 class CHI_L2Controller(CHI_Cache_Controller):
     """
@@ -288,16 +295,18 @@ def __init__(self, ruby_system, cache, prefetcher):
         self.alloc_on_atomic = False
         self.dealloc_on_unique = False
         self.dealloc_on_shared = False
-        self.dealloc_backinv_unique = True
-        self.dealloc_backinv_shared = True
+        self.dealloc_backinv_unique = False
+        self.dealloc_backinv_shared = False
         # Some reasonable default TBE params
-        self.number_of_TBEs = 32
+        self.number_of_TBEs = 64+16
         self.number_of_repl_TBEs = 32
-        self.number_of_snoop_TBEs = 16
+        self.number_of_snoop_TBEs = 32
         self.number_of_DVM_TBEs = 1 # should not receive any dvm
         self.number_of_DVM_snoop_TBEs = 1 # should not receive any dvm
         self.unify_repl_TBEs = False

+        self.response_latency = 12
+        self.request_latency = 1

 class CHI_HNFController(CHI_Cache_Controller):
     """
@@ -329,13 +338,17 @@ def __init__(self, ruby_system, cache, prefetcher, addr_ranges):
         self.dealloc_backinv_unique = False
         self.dealloc_backinv_shared = False
         # Some reasonable default TBE params
-        self.number_of_TBEs = 32
+        self.number_of_TBEs = 256 + 32
         self.number_of_repl_TBEs = 32
         self.number_of_snoop_TBEs = 1 # should not receive any snoop
         self.number_of_DVM_TBEs = 1 # should not receive any dvm
         self.number_of_DVM_snoop_TBEs = 1 # should not receive any dvm
         self.unify_repl_TBEs = False

+        self.response_latency = 40
+        self.request_latency = 1
+
+
 class CHI_MNController(MiscNode_Controller):
     """
@@ -466,8 +479,7 @@ def __init__(
         l1Icache_type,
         l1Dcache_type,
         cache_line_size,
-        l1Iprefetcher_type=None,
-        l1Dprefetcher_type=None,
+        options
     ):
         super().__init__(ruby_system)
@@ -505,16 +517,13 @@ def __init__(
                 start_index_bit=self._block_size_bits,
                 is_icache=False
             )
-            # prefetcher wrappers
-            if l1Iprefetcher_type != None:
-                l1i_pf = l1Iprefetcher_type()
-            else:
-                l1i_pf = NULL
+            # create icache prefetcher
+            l1i_pf = NULL

-            if l1Dprefetcher_type != None:
-                l1d_pf = l1Dprefetcher_type()
-            else:
-                l1d_pf = NULL
+            # create dcache prefetcher
+            l1d_pf = create_prefetcher(cpu, 'l1d', options)
+            if l1d_pf != NULL and options.cpu_type == 'DerivO3CPU':
+                cpu.add_pf_downstream(l1d_pf)

             # cache controllers
             cpu.l1i = CHI_L1Controller(
@@ -552,30 +561,38 @@ def setDownstream(self, cntrls):
     def getCpus(self):
         return self._cpus

+    def getL1DCachePrefetcher(self, cpu_idx):
+        return self._cpus[cpu_idx].l1d.prefetcher
+
+    def getL2CachePrefetcher(self, cpu_idx):
+        return self._cpus[cpu_idx].l2.prefetcher
+
     # Adds a private L2 for each cpu
-    def addPrivL2Cache(self, cache_type, pf_type=None):
+    def addPrivL2Cache(self, cache_type, options):
         self._ll_cntrls = []
         for cpu in self._cpus:
             l2_cache = cache_type(
                 start_index_bit=self._block_size_bits,
                 is_icache=False
             )
-            if pf_type != None:
-                l2_pf = pf_type()
-            else:
-                l2_pf = NULL
+            l2_pf = create_prefetcher(NULL, 'l2', options)
+            if l2_pf != NULL and options.l1_to_l2_pf_hint:
+                cpu.l1d.prefetcher.add_pf_downstream(l2_pf)
             cpu.l2 = CHI_L2Controller(self._ruby_system, l2_cache, l2_pf)

             self._cntrls.append(cpu.l2)
             self.connectController(cpu.l2)
-
             self._ll_cntrls.append(cpu.l2)

             for c in cpu._ll_cntrls:
                 c.downstream_destinations = [cpu.l2]
             cpu._ll_cntrls = [cpu.l2]

+    def addLLCPrefetcherDownstream(self, llc_pf):
+        for cpu in self._cpus:
+            cpu.l2.prefetcher.add_pf_downstream(llc_pf)
+

 class CHI_HNF(CHI_Node):
     """
@@ -618,16 +635,18 @@ def getAddrRanges(cls, hnf_idx):

     # The CHI controller can be a child of this object or another if
     # 'parent' if specified
-    def __init__(self, hnf_idx, ruby_system, llcache_type, parent):
+    def __init__(self, hnf_idx, ruby_system, llcache_type, options, parent):
         super().__init__(ruby_system)

         addr_ranges, intlvHighBit = self.getAddrRanges(hnf_idx)
         # All ranges should have the same interleaving
         assert len(addr_ranges) >= 1

+        llc_pf = create_prefetcher(NULL, 'l3', options)
+
         ll_cache = llcache_type(start_index_bit=intlvHighBit + 1)
         self._cntrl = CHI_HNFController(
-            ruby_system, ll_cache, NULL, addr_ranges
+            ruby_system, ll_cache, llc_pf, addr_ranges
         )

         if parent == None:
@@ -637,6 +656,9 @@ def __init__(self, hnf_idx, ruby_system, llcache_type, options, parent):

         self.connectController(self._cntrl)

+    def getPrefetcher(self):
+        return self._cntrl.prefetcher
+
     def getAllControllers(self):
         return [self._cntrl]
@@ -698,6 +720,7 @@ def __init__(self, ruby_system, parent):
             requestToMemory=MemCtrlMessageBuffer(),
             reqRdy=TriggerMessageBuffer(),
             transitions_per_cycle=1024,
+            number_of_TBEs = 1024
         )

         # The Memory_Controller implementation deallocates the TBE for
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 305f6f5ccb..0e31196bab 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -69,7 +69,7 @@ def define_options(parser):
         "--ruby-clock",
         action="store",
         type=str,
-        default="2GHz",
+        default="3GHz",
         help="Clock for blocks running at Ruby system's speed",
     )
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index 23ed3833ff..5b7d16bb00 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -112,7 +112,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst)
         }
         // check bypass data ready or not
         if (!scheduler->bypassScoreboard[src->flatIndex()]) {
-            panic("[sn %lu] %s can't get data from bypassNetwork\n", inst->seqNum, inst->srcRegIdx(i));
+            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex());
+            panic("[sn %lu] %s can't get data from bypassNetwork, dst inst: %s\n", inst->seqNum, inst->srcRegIdx(i),
+                  dst_inst->genDisassembly());
         }
     }
     inst->checkOldVdElim();
@@ -578,6 +580,20 @@ Scheduler::full(const DynInstPtr& inst)
     return true;
 }

+DynInstPtr
+Scheduler::getInstByDstReg(RegIndex flatIdx)
+{
+    for (auto iq : issueQues) {
+        for (auto& inst : iq->instList) {
+            if (inst->numDestRegs() > 0 && inst->renamedDestIdx(0)->flatIndex() == flatIdx) {
+                return inst;
+            }
+        }
+    }
+    return nullptr;
+}
+
 void
 Scheduler::addProducer(const DynInstPtr& inst)
 {
@@ -693,6 +709,9 @@ Scheduler::insertSlot(const DynInstPtr& inst)
 void
 Scheduler::loadCancel(const DynInstPtr& inst)
 {
+    if (inst->canceled()) {
+        return;
+    }
     DPRINTF(Schedule, "[sn %lu] %s cache miss, cancel consumers\n", inst->seqNum,
             enums::OpClassStrings[inst->opClass()]);
     inst->setCancel();
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index 867e808e57..d569de074a 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -218,6 +218,7 @@ class Scheduler : public SimObject
     void issueAndSelect();
     bool full(const DynInstPtr& inst);
     bool ready(const DynInstPtr& inst);
+    DynInstPtr getInstByDstReg(RegIndex flatIdx);
     void addProducer(const DynInstPtr& inst);
     // return true if insert successful
diff --git a/src/dev/riscv/nemu_mmc.cc b/src/dev/riscv/nemu_mmc.cc
index fc30e28c8c..255d31f449 100644
--- a/src/dev/riscv/nemu_mmc.cc
+++ b/src/dev/riscv/nemu_mmc.cc
@@ -17,7 +17,6 @@ NemuMMC::NemuMMC(const NemuMMCParams *p)
       , write_cmd(false)
       , read_ext_csd(false)
 {
-    printf("111\n");
     assert(C_SIZE < (1 << 12));
     sd_reg_base = (uint32_t *)malloc(0x80);
     img_fp = fopen(p->img_path.c_str(), "rb");
@@ -31,7 +30,6 @@ NemuMMC::NemuMMC(const NemuMMCParams *p)
 inline void
 NemuMMC::prepare_rw(int is_write)
 {
-    printf("222\n");
     blk_addr = sd_reg_base[SDARG];
     tmp_addr = 0;
     if (img_fp)
@@ -42,7 +40,6 @@ NemuMMC::prepare_rw(int is_write)
 void
 NemuMMC::sdcard_handle_cmd(int cmd)
 {
-    // printf("333\n");
     switch (cmd) {
         case MMC_GO_IDLE_STATE: break;
@@ -101,7 +98,6 @@ NemuMMC::sdcard_handle_cmd(int cmd)
 void
 NemuMMC::sdcard_io_handler(uint32_t offset)
 {
-    // printf("444\n");
     assert(img_fp);
     int idx = offset / 4;
     switch (idx) {
@@ -149,7 +145,6 @@ NemuMMC::sdcard_io_handler(uint32_t offset)
 void
 NemuMMC::unserialize_sdcard(FILE *sdfp)
 {
-    // printf("555\n");
     __attribute__((unused)) int ret;
     ret = fread(sd_reg_base, 4, 0x80 / 4, sdfp);
     ret = fread(&tmp_addr, 4, 1, sdfp);
@@ -163,7 +158,6 @@ NemuMMC::unserialize_sdcard(FILE *sdfp)
 Tick
 NemuMMC::read(PacketPtr pkt)
 {
-    // printf("666\n");
     assert(pkt->getSize() == 4);
     Addr offset = pkt->getAddr() - pioAddr;
     // handler before read
@@ -180,7 +174,6 @@ NemuMMC::read(PacketPtr pkt)
 Tick
 NemuMMC::write(PacketPtr pkt)
 {
-    //printf("777\n");
     assert(pkt->getSize() == 4);
     Addr offset = pkt->getAddr() - pioAddr;
     uint32_t write_val = pkt->getRaw<uint32_t>();
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 791a2223c8..1a0aaa7768 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -98,11 +98,6 @@ class BaseCache(ClockedObject):
     enable_wayprediction = Param.Bool(True, "enablewaypredction")

     prefetcher = Param.BasePrefetcher(NULL,"Prefetcher attached to cache")
-    prefetch_on_access = Param.Bool(False,
-        "Notify the hardware prefetcher on every access (not just misses)")
-    prefetch_on_pf_hit = Param.Bool(False,
-        "Notify the hardware prefetcher on hit on prefetched lines")
-
     tags = Param.BaseTags(BaseSetAssoc(), "Tag store")
     replacement_policy = Param.BaseReplacementPolicy(LRURP(), "Replacement policy")
@@ -157,8 +152,6 @@ class BaseCache(ClockedObject):

     cache_level = Param.Unsigned(0, "Cache level (L1 is 1, L2 is 2, etc.)")

-    max_cache_level = Param.Unsigned(2, "Max Cache level (L1 is 1, L2 is 2, etc.)")
-
     force_hit = Param.Bool(False, "Force some PC to hit in L1")

     way_entries = Param.MemorySize(
         "64",
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 38d5fff7a7..146dcf35a5 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -145,7 +145,6 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
       system(p.system),
       stats(*this),
       cacheLevel(p.cache_level),
-      maxCacheLevel(p.max_cache_level),
       forceHit(p.force_hit)
 {
     // the MSHR queue has no reserve entries as we check the MSHR
@@ -166,7 +165,7 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
     assert(!((size != DEFAULTWAYPRESIZE) && enableWayPrediction));

     if (prefetcher)
-        prefetcher->setCache(this);
+        prefetcher->setParentInfo(system, getProbeManager(), this, getBlockSize());

     fatal_if(compressor && !dynamic_cast<CompressedTags *>(tags),
              "The tags of compressed cache %s must derive from CompressedTags",
@@ -1150,28 +1149,28 @@ BaseCache::getNextQueueEntry()
         PacketPtr pkt = prefetcher->getPacket();
         if (pkt) {
             Addr pf_addr = pkt->getBlockAddr(blkSize);
-            int pf_num = pkt->req->getXsMetadata().prefetchSource;
+            PrefetchSourceType pf_type = pkt->req->getXsMetadata().prefetchSource;
             if (tags->findBlock(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in cache, "
                         "dropped.\n", pf_addr);
-                prefetcher->pfHitInCache(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInCache(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
             } else if (mshrQueue.findMatch(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in a MSHR, "
                         "dropped.\n", pf_addr);
-                prefetcher->pfHitInMSHR(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInMSHR(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
             } else if (writeBuffer.findMatch(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in the "
                         "Write Buffer, dropped.\n", pf_addr);
-                prefetcher->pfHitInWB(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInWB(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index ab7607f4c5..54c0169086 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -60,6 +60,7 @@
 #include "debug/CacheTrace.hh"
 #include "enums/Clusivity.hh"
 #include "mem/cache/cache_blk.hh"
+#include "mem/cache/cache_probe_arg.hh"
 #include "mem/cache/compressors/base.hh"
 #include "mem/cache/mshr_queue.hh"
 #include "mem/cache/prefetch/associative_set.hh"
@@ -95,7 +96,7 @@ struct BaseCacheParams;

 /**
  * A basic cache interface. Implements some common functions for speed.
  */
-class BaseCache : public ClockedObject
+class BaseCache : public ClockedObject, CacheAccessor
 {
   protected:
     /**
@@ -1365,50 +1366,6 @@ class BaseCache : public ClockedObject, CacheAccessor
         memSidePort.schedSendEvent(time);
     }

-    bool inCache(Addr addr, bool is_secure) const {
-        return tags->findBlock(addr, is_secure);
-    }
-
-    bool hasBeenPrefetched(Addr addr, bool is_secure) const {
-        CacheBlk *block = tags->findBlock(addr, is_secure);
-        if (block) {
-            return block->wasEverPrefetched();
-        } else {
-            return false;
-        }
-    }
-
-    bool hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const {
-        CacheBlk *block = tags->findBlock(addr, is_secure);
-        if (block) {
-            return block->wasPrefetched();
-        } else {
-            return false;
-        }
-    }
-
-    CacheBlk* findBlock(Addr addr, bool is_secure) {
-        return tags->findBlock(addr, is_secure);
-    }
-
-    Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt)
-    {
-        CacheBlk *block = tags->findBlock(pkt->getAddr(), pkt->isSecure());
-        assert(block);
-        /* clean prefetchSource if the block was not prefetched */
-        if (!block->wasEverPrefetched()) {
-            Request::XsMetadata blkMeta = block->getXsMetadata();
-            blkMeta.prefetchSource = PrefetchSourceType::PF_NONE;
-            block->setXsMetadata(blkMeta);
-        }
-        return block->getXsMetadata();
-    }
-
-
-    bool inMissQueue(Addr addr, bool is_secure) const {
-        return mshrQueue.findMatch(addr, is_secure);
-    }
-
     void incMissCount(PacketPtr pkt)
     {
         assert(pkt->req->requestorId() < system->maxRequestors());
@@ -1455,14 +1412,6 @@ class BaseCache : public ClockedObject, CacheAccessor
         }
     }

-    /**
-     * Checks if the cache is coalescing writes
-     *
-     * @return True if the cache is coalescing writes
-     */
-    bool coalesce() const;
-
-
     /**
      * Cache block visitor that writes back dirty cache blocks using
      * functional writes.
@@ -1508,7 +1457,7 @@ class BaseCache : public ClockedObject, CacheAccessor

     const unsigned cacheLevel{0};

-    const unsigned maxCacheLevel;
+    //const unsigned maxCacheLevel;

     const bool dumpMissPC{false};
@@ -1520,7 +1469,60 @@ class BaseCache : public ClockedObject, CacheAccessor
     const bool forceHit;

   public:
-    unsigned level() { return cacheLevel; }
+    // CacheAccessor overridden functions
+
+    bool inCache(Addr addr, bool is_secure) const override { return tags->findBlock(addr, is_secure); }
+
+    unsigned level() const override { return cacheLevel; }
+
+    bool hasBeenPrefetched(Addr addr, bool is_secure) const override
+    {
+        CacheBlk *block = tags->findBlock(addr, is_secure);
+        if (block) {
+            return block->wasPrefetched();
+        } else {
+            return false;
+        }
+    }
+
+    bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) const override {
+        panic("hasBeenPrefetched Not implemented");
+        return false;
+    }
+
+    bool hasEverBeenPrefetched(Addr addr, bool is_secure) const override
+    {
+        CacheBlk *block = tags->findBlock(addr, is_secure);
+        if (block) {
+            return block->wasEverPrefetched();
+        } else {
+            return false;
+        }
+    }
+
+    Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) override
+    {
+        CacheBlk *block = tags->findBlock(pkt->getAddr(), pkt->isSecure());
+        assert(block);
+        /* clean prefetchSource if the block was not prefetched */
+        if (!block->wasEverPrefetched()) {
+            Request::XsMetadata blkMeta = block->getXsMetadata();
+            blkMeta.prefetchSource = PrefetchSourceType::PF_NONE;
+            block->setXsMetadata(blkMeta);
+        }
+        return block->getXsMetadata();
+    }
+
+    bool inMissQueue(Addr addr, bool is_secure) const override {
+        return mshrQueue.findMatch(addr, is_secure);
+    }
+
+    bool coalesce() const override;
+
+    const uint8_t* findBlock(Addr addr, bool is_secure) const override {
+        auto blk = tags->findBlock(addr, is_secure);
+        return blk ? blk->data : nullptr;
+    }
 };

 /**
diff --git a/src/mem/cache/cache_blk.hh b/src/mem/cache/cache_blk.hh
index 5363b92dff..424914e799 100644
--- a/src/mem/cache/cache_blk.hh
+++ b/src/mem/cache/cache_blk.hh
@@ -206,7 +206,7 @@ class CacheBlk : public TaggedEntry
     {
         TaggedEntry::invalidate();

-        clearPrefetched();
+        clearAllPrefetched();
         clearPendingInvalidate();
         clearCoherenceBits(AllBits);
diff --git a/src/mem/cache/cache_probe_arg.hh b/src/mem/cache/cache_probe_arg.hh
new file mode 100644
index 0000000000..ea7b10d88a
--- /dev/null
+++ b/src/mem/cache/cache_probe_arg.hh
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __MEM_CACHE_PROBE_ARG_HH__
+#define __MEM_CACHE_PROBE_ARG_HH__
+
+#include "mem/packet.hh"
+
+namespace gem5
+{
+
+/**
+ * Provides generic cache lookup functions. A cache may provide
+ * a CacheAccessor object to other components that need to perform
+ * a lookup outside the normal cache control flow. Currently this
+ * is used by prefetchers that perform lookups when notified by
+ * cache events.
+ */
+struct CacheAccessor
+{
+    /** Determine if address is in cache */
+    virtual bool inCache(Addr addr, bool is_secure) const = 0;
+
+    // cache level, l1 is 1, l2 is 2, etc.
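+    //  (consumed by cross-level prefetch plumbing, e.g. when deciding which
+    //  cache level a prefetch-ahead request should be sent to)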
+ virtual unsigned level() const = 0; + + /** Determine if address has been prefetched */ + virtual bool hasBeenPrefetched(Addr addr, bool is_secure) const = 0; + + /** Determine if address has been prefetched by the requestor */ + virtual bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) const = 0; + + virtual bool hasEverBeenPrefetched(Addr addr, bool is_secure) const = 0; + + virtual Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) = 0; + + /** Determine if address is in cache miss queue */ + virtual bool inMissQueue(Addr addr, bool is_secure) const = 0; + + /** Determine if cache is coalescing writes */ + virtual bool coalesce() const = 0; + + virtual const uint8_t* findBlock(Addr addr, bool is_secure) const = 0; +}; + +/** + * Information provided to probes on a cache event. + * @sa ppHit, ppMiss, ppFill in gem5::BaseCache (src/mem/cache/base.hh) + */ +class CacheAccessProbeArg +{ + public: + /** Packet that triggered the cache access*/ + PacketPtr pkt; + /** Accessor for the cache */ + CacheAccessor &cache; + + CacheAccessProbeArg(PacketPtr _pkt, CacheAccessor &_cache) + :pkt(_pkt), cache(_cache) + { + } +}; + +/** + * A data contents update is composed of the updated block's address, + * the old contents, and the new contents. + * @sa ppDataUpdate in gem5::BaseCache (src/mem/cache/base.hh) + */ +struct CacheDataUpdateProbeArg +{ + /** The updated block's address. */ + Addr addr; + /** Whether the block belongs to the secure address space. */ + bool isSecure; + /** Block original requestor */ + const RequestorID requestorID; + /** The stale data contents. If zero-sized this update is a fill. */ + std::vector oldData; + /** The new data contents. If zero-sized this is an invalidation. */ + std::vector newData; + /** Set if the update is from a prefetch or evicting a prefetched + * block that was never used. 
*/ + bool hwPrefetched; + /** Accessor for the cache */ + CacheAccessor &accessor; + + CacheDataUpdateProbeArg(Addr _addr, bool is_secure, + RequestorID _requestorID, + CacheAccessor &_accessor) + : addr(_addr), isSecure(is_secure), requestorID(_requestorID), + oldData(), newData(), accessor(_accessor) + { + } +}; + +} // namespace gem5 + +#endif //__MEM_CACHE_PROBE_ARG_HH__ diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py index 5f6621e7c8..75649d9d94 100644 --- a/src/mem/cache/prefetch/Prefetcher.py +++ b/src/mem/cache/prefetch/Prefetcher.py @@ -77,9 +77,9 @@ class BasePrefetcher(ClockedObject): on_write = Param.Bool(True, "Notify prefetcher on writes") on_data = Param.Bool(True, "Notify prefetcher on data accesses") on_inst = Param.Bool(True, "Notify prefetcher on instruction accesses") - prefetch_on_access = Param.Bool(Parent.prefetch_on_access, + prefetch_on_access = Param.Bool(False, "Notify the hardware prefetcher on every access (not just misses)") - prefetch_on_pf_hit = Param.Bool(Parent.prefetch_on_pf_hit, + prefetch_on_pf_hit = Param.Bool(False, "Notify the hardware prefetcher on hit on prefetched lines") use_virtual_addresses = Param.Bool(False, "Use virtual addresses for prefetching") @@ -88,7 +88,6 @@ class BasePrefetcher(ClockedObject): is_sub_prefetcher = Param.Bool(False, "Is this a sub-prefetcher") - max_cache_level = Param.Unsigned(Parent.max_cache_level , "Max Cache level (L1 is 1, L2 is 2, etc.)") def __init__(self, **kwargs): super().__init__(**kwargs) @@ -104,10 +103,12 @@ def addEvent(self, newObject): def regProbeListeners(self): print("Registering probe listeners for Prefetcher {}".format(self)) for tlb in self._tlbs: + print(f"{self} addTLB {tlb}") self.getCCObject().addTLB(tlb.getCCObject()) assert len(self._downstream_pf) <= 1 if len(self._downstream_pf): + print(f"{self} addHintDownStream {self._downstream_pf[0]}") self.getCCObject().addHintDownStream(self._downstream_pf[0].getCCObject()) for event in self._events: @@ -329,6 +330,7 @@ class WorkerPrefetcher(QueuedPrefetcher): on_data = True on_miss = False + prefetch_on_access = True prefetch_on_pf_hit = True use_virtual_addresses = True diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc index 1731761cf7..0cbdcef5ad 100644 --- a/src/mem/cache/prefetch/base.cc +++ b/src/mem/cache/prefetch/base.cc @@ -126,7 +126,7 @@ Base::PrefetchListener::notify(const PacketPtr &pkt) Base::Base(const BasePrefetcherParams &p) : ClockedObject(p), - listeners(), cache(nullptr), isSubPrefetcher(p.is_sub_prefetcher), + listeners(), isSubPrefetcher(p.is_sub_prefetcher), archDBer(p.arch_db), blkSize(p.block_size), lBlkSize(floorLog2(blkSize)), onMiss(p.on_miss), onRead(p.on_read), onWrite(p.on_write), onData(p.on_data), onInst(p.on_inst), @@ -136,19 +136,20 @@ Base::Base(const BasePrefetcherParams &p) prefetchOnPfHit(p.prefetch_on_pf_hit), useVirtualAddresses(p.use_virtual_addresses), prefetchStats(this), issuedPrefetches(0), - usefulPrefetches(0), streamlatenum(0),tlb(nullptr), maxCacheLevel(p.max_cache_level), - probeManagerDirty(nullptr) + usefulPrefetches(0), streamlatenum(0),tlb(nullptr) { } void -Base::setCache(BaseCache *_cache) +Base::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - assert(!cache); + assert(!cache && !system && !probeManager); + system = sys; + probeManager = pm; cache = _cache; // If the cache has a different block size from the system's, save it - blkSize = cache->getBlockSize(); + blkSize = blk_size; 
lBlkSize = floorLog2(blkSize); } @@ -244,7 +245,7 @@ Base::observeAccess(const PacketPtr &pkt, bool miss) const if (!miss) { if (prefetchOnPfHit) - return hasBeenPrefetched(pkt->getAddr(), pkt->isSecure()); + return hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure()); if (!prefetchOnAccess) return false; } @@ -282,9 +283,9 @@ Base::hasBeenPrefetched(Addr addr, bool is_secure) const } bool -Base::hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const +Base::hasEverBeenPrefetched(Addr addr, bool is_secure) const { - return cache->hasBeenPrefetchedAndNotAccessed(addr, is_secure); + return cache->hasEverBeenPrefetched(addr, is_secure); } bool @@ -358,7 +359,7 @@ Base::probeNotify(const PacketPtr &pkt, bool miss) DPRINTF(HWPrefetch, "Reach condition checked\n"); - if (hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())) { + if (hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())) { usefulPrefetches += 1; prefetchStats.pfUseful++; PrefetchSourceType pf_source = cache->getHitBlkXsMetadata(pkt).prefetchSource; @@ -386,9 +387,9 @@ Base::probeNotify(const PacketPtr &pkt, bool miss) PrefetchInfo pfi(pkt, pkt->req->hasVaddr() ? pkt->req->getVaddr() : pkt->req->getPaddr(), miss, Request::XsMetadata(pf_source, pf_depth)); pfi.setReqAfterSquash(squashMark); - pfi.setEverPrefetched(hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); - pfi.setPfFirstHit(!miss && hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())); - pfi.setPfHit(!miss && hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setEverPrefetched(hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setPfFirstHit(!miss && hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setPfHit(!miss && hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure())); squashMark = false; notify(pkt, pfi); } else { @@ -431,18 +432,11 @@ Base::regProbeListeners() * parent cache using the probe "Miss". Also connect to "Hit", if the * cache is configured to prefetch on accesses. 
*/ - if (listeners.empty() && !isSubPrefetcher) { - assert((cache != nullptr) != (probeManagerDirty != nullptr)); - ProbeManager* pm(nullptr); - if (cache != nullptr) { - pm = cache->getProbeManager(); - } else if (probeManagerDirty != nullptr) { - pm = probeManagerDirty; - } - listeners.push_back(new PrefetchListener(*this, pm, "StorePFtrain", false, true, true)); - listeners.push_back(new PrefetchListener(*this, pm, "Miss", false, true, false)); - listeners.push_back(new PrefetchListener(*this, pm, "Fill", true, false, false)); - listeners.push_back(new PrefetchListener(*this, pm, "Hit", false, false, false)); + if (listeners.empty() && !isSubPrefetcher && probeManager != nullptr) { + listeners.push_back(new PrefetchListener(*this, probeManager, "StorePFtrain", false, true, true)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Miss", false, true, false)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Fill", true, false, false)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Hit", false, false, false)); } } diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh index 71ecebcfb4..532af8aa07 100644 --- a/src/mem/cache/prefetch/base.hh +++ b/src/mem/cache/prefetch/base.hh @@ -52,7 +52,7 @@ #include "base/compiler.hh" #include "base/statistics.hh" #include "base/types.hh" -#include "mem/cache/cache_blk.hh" +#include "mem/cache/cache_probe_arg.hh" #include "mem/packet.hh" #include "mem/request.hh" #include "sim/arch_db.hh" @@ -63,7 +63,6 @@ namespace gem5 { -class BaseCache; struct BasePrefetcherParams; GEM5_DEPRECATED_NAMESPACE(Prefetcher, prefetch); @@ -332,14 +331,20 @@ class Base : public ClockedObject protected: + bool isSubPrefetcher; + + ArchDBer* archDBer; + // PARAMETERS /** Pointr to the parent cache. */ - BaseCache* cache; + CacheAccessor* cache = nullptr; - bool isSubPrefetcher; + /** Pointer to the parent system. */ + System* system = nullptr; - ArchDBer* archDBer; + /** Pointer to the parent cache's probe manager. */ + ProbeManager *probeManager = nullptr; /** The block size of the parent cache. 
*/ unsigned blkSize; @@ -390,7 +395,7 @@ class Base : public ClockedObject bool inMissQueue(Addr addr, bool is_secure) const; bool hasBeenPrefetched(Addr addr, bool is_secure) const; - bool hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const; + bool hasEverBeenPrefetched(Addr addr, bool is_secure) const; /** Determine if addresses are on the same page */ bool samePage(Addr a, Addr b) const; @@ -458,20 +463,11 @@ class Base : public ClockedObject /** Registered tlb for address translations */ BaseTLB * tlb; - const unsigned maxCacheLevel; - - /** Proxied Prefetchers in ruby does not have cache as parent - * so need to set probe manager explicitly - */ - ProbeManager *probeManagerDirty; - public: Base(const BasePrefetcherParams &p); virtual ~Base() = default; - virtual void setCache(BaseCache *_cache); - - void setPMInfoDirty(ProbeManager *pm) { probeManagerDirty = pm; } + virtual void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size); /** * Notify prefetcher of cache access (may be any access or just @@ -501,27 +497,27 @@ class Base : public ClockedObject } void - pfHitInCache(int pf_num) + pfHitInCache(PrefetchSourceType pf_type) { prefetchStats.pfHitInCache++; - prefetchStats.pfHitInCache_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInCache_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void - pfHitInMSHR(int pf_num) + pfHitInMSHR(PrefetchSourceType pf_type) { prefetchStats.pfHitInMSHR++; - prefetchStats.pfHitInMSHR_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInMSHR_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void - pfHitInWB(int pf_num) + pfHitInWB(PrefetchSourceType pf_type) { prefetchStats.pfHitInWB++; - prefetchStats.pfHitInWB_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInWB_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void streamPflate() { streamlatenum++; } diff --git a/src/mem/cache/prefetch/berti.cc b/src/mem/cache/prefetch/berti.cc index 7ec09a4c78..4e2a8492b2 100644 --- a/src/mem/cache/prefetch/berti.cc +++ b/src/mem/cache/prefetch/berti.cc @@ -309,8 +309,8 @@ BertiPrefetcher::notifyFill(const PacketPtr &pkt) } // fill latency - Cycles miss_refill_search_lat = ticksToCycles(200); - hitSearchLatency = ticksToCycles(50); + Cycles miss_refill_search_lat = Cycles(0); + hitSearchLatency = Cycles(0); HistoryTableEntry *entry = historyTable.findEntry(pcHash(pkt->req->getPC()), pkt->req->isSecure()); diff --git a/src/mem/cache/prefetch/cdp.cc b/src/mem/cache/prefetch/cdp.cc index 9775ee3a81..381c011366 100644 --- a/src/mem/cache/prefetch/cdp.cc +++ b/src/mem/cache/prefetch/cdp.cc @@ -104,7 +104,7 @@ CDP::calculatePrefetch(const PrefetchInfo &pfi, std::vector &addre PrefetchSourceType pf_source = pfi.getXsMetadata().prefetchSource; int pf_depth = pfi.getXsMetadata().prefetchDepth; bool is_prefetch = - cache->system->getRequestorName(pfi.getRequestorId()).find("dcache.prefetcher") != std::string::npos; + system->getRequestorName(pfi.getRequestorId()).find("dcache.prefetcher") != std::string::npos; if (!miss && pfi.getDataPtr() != nullptr) { if (is_prefetch && enable_prf_filter[pf_source]) { return; @@ -169,13 +169,13 @@ CDP::notifyWithData(const PacketPtr &pkt, bool is_l1_use, std::vectorreq->getVaddr(), pkt->getSize()); - auto *blk = cache->findBlock(pkt->getAddr(), pkt->isSecure()); - if (!blk) { + auto *blk_data = cache->findBlock(pkt->getAddr(), pkt->isSecure()); + if (!blk_data) { 
cdpStats.dataNotifyExitBlockNotFound++; return; } Request::XsMetadata pkt_meta = cache->getHitBlkXsMetadata(pkt); - size_t prefetch_type = cache->system->getRequestorName(pkt->req->requestorId()).find("dcache.prefetcher"); + size_t prefetch_type = system->getRequestorName(pkt->req->requestorId()).find("dcache.prefetcher"); int pf_depth = pkt_meta.prefetchDepth; PrefetchSourceType pf_source = pkt_meta.prefetchSource; if (!is_l1_use && prefetch_type != std::string::npos) { @@ -184,7 +184,7 @@ CDP::notifyWithData(const PacketPtr &pkt, bool is_l1_use, std::vectordata; + const uint64_t *test_addr_start = (const uint64_t *)blk_data; unsigned max_offset = blkSize / sizeof(uint64_t); switch (byteOrder) { case ByteOrder::big: diff --git a/src/mem/cache/prefetch/cmc.cc b/src/mem/cache/prefetch/cmc.cc index a313aaa012..156a199f22 100644 --- a/src/mem/cache/prefetch/cmc.cc +++ b/src/mem/cache/prefetch/cmc.cc @@ -135,7 +135,7 @@ CMCPrefetcher::doPrefetch(const PrefetchInfo &pfi, std::vector &ad // addresses.push_back(AddrPriority(addr, mixedNum, PrefetchSourceType::CMC)); if (sendPFWithFilter(pfi, addr, addresses, priority, PrefetchSourceType::CMC)) { num_send++; - if (maxCacheLevel == 3 && num_send > 24) { + if (num_send > 24) { addresses.back().pfahead = true; addresses.back().pfahead_host = 3; } else if (num_send > 4) { diff --git a/src/mem/cache/prefetch/composite_with_worker.cc b/src/mem/cache/prefetch/composite_with_worker.cc index 02cc83d565..b35de3c98a 100644 --- a/src/mem/cache/prefetch/composite_with_worker.cc +++ b/src/mem/cache/prefetch/composite_with_worker.cc @@ -41,7 +41,7 @@ CompositeWithWorkerPrefetcher::postNotifyInsert(const PacketPtr &trigger_pkt, st if (!samePage(addr_prio.addr, pfi.getAddr())) { statsQueued.pfSpanPage += 1; - if (hasBeenPrefetched(trigger_pkt->getAddr(), trigger_pkt->isSecure())) { + if (hasEverBeenPrefetched(trigger_pkt->getAddr(), trigger_pkt->isSecure())) { statsQueued.pfUsefulSpanPage += 1; } } @@ -65,9 +65,9 @@ CompositeWithWorkerPrefetcher::postNotifyInsert(const PacketPtr &trigger_pkt, st } void -CompositeWithWorkerPrefetcher::setCache(BaseCache *_cache) +CompositeWithWorkerPrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - Base::setCache(_cache); + Base::setParentInfo(sys, pm, _cache, blk_size); } } // namespace prefetch diff --git a/src/mem/cache/prefetch/composite_with_worker.hh b/src/mem/cache/prefetch/composite_with_worker.hh index f569199254..870615ae59 100644 --- a/src/mem/cache/prefetch/composite_with_worker.hh +++ b/src/mem/cache/prefetch/composite_with_worker.hh @@ -23,7 +23,7 @@ class CompositeWithWorkerPrefetcher: public WorkerPrefetcher bool hasHintsWaiting() override { return !localBuffer.empty(); } - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override; diff --git a/src/mem/cache/prefetch/l2_composite_with_worker.cc b/src/mem/cache/prefetch/l2_composite_with_worker.cc index 83854e0292..62050eb340 100644 --- a/src/mem/cache/prefetch/l2_composite_with_worker.cc +++ b/src/mem/cache/prefetch/l2_composite_with_worker.cc @@ -60,11 +60,11 @@ L2CompositeWithWorkerPrefetcher::pfHitNotify(float accuracy, PrefetchSourceType } void -L2CompositeWithWorkerPrefetcher::setCache(BaseCache *_cache) +L2CompositeWithWorkerPrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - cdp->setCache(_cache); + 
cdp->setParentInfo(sys, pm, _cache, blk_size); cdp->setStatsPtr(&prefetchStats); - CompositeWithWorkerPrefetcher::setCache(_cache); + CompositeWithWorkerPrefetcher::setParentInfo(sys, pm, _cache, blk_size); } void diff --git a/src/mem/cache/prefetch/l2_composite_with_worker.hh b/src/mem/cache/prefetch/l2_composite_with_worker.hh index fcd7d6f62e..3cd0939c8f 100644 --- a/src/mem/cache/prefetch/l2_composite_with_worker.hh +++ b/src/mem/cache/prefetch/l2_composite_with_worker.hh @@ -29,7 +29,7 @@ class L2CompositeWithWorkerPrefetcher : public CompositeWithWorkerPrefetcher void rxHint(BaseMMU::Translation *dpp) override; void pfHitNotify(float accuracy, PrefetchSourceType pf_source, const PacketPtr &pkt) override; - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override; diff --git a/src/mem/cache/prefetch/multi.cc b/src/mem/cache/prefetch/multi.cc index 8137e6a231..c67b634192 100644 --- a/src/mem/cache/prefetch/multi.cc +++ b/src/mem/cache/prefetch/multi.cc @@ -54,10 +54,10 @@ Multi::Multi(const MultiPrefetcherParams &p) } void -Multi::setCache(BaseCache *_cache) +Multi::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { for (auto pf : prefetchers) - pf->setCache(_cache); + pf->setParentInfo(sys, pm, _cache, blk_size); } Tick diff --git a/src/mem/cache/prefetch/multi.hh b/src/mem/cache/prefetch/multi.hh index 46469729ca..140e8cda6b 100644 --- a/src/mem/cache/prefetch/multi.hh +++ b/src/mem/cache/prefetch/multi.hh @@ -57,7 +57,7 @@ class Multi : public Base Multi(const MultiPrefetcherParams &p); public: - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void addTLB(BaseTLB* _t) override; PacketPtr getPacket() override; Tick nextPrefetchReadyTime() const override; diff --git a/src/mem/cache/prefetch/queued.cc b/src/mem/cache/prefetch/queued.cc index 246351c00c..f38f6e91cc 100644 --- a/src/mem/cache/prefetch/queued.cc +++ b/src/mem/cache/prefetch/queued.cc @@ -256,7 +256,7 @@ Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi) if (!samePage(addr_prio.addr, pfi.getAddr())) { statsQueued.pfSpanPage += 1; - if (hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())) { + if (hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())) { statsQueued.pfUsefulSpanPage += 1; } } @@ -379,7 +379,7 @@ Queued::translationComplete(DeferredPacket *dp, bool failed) statsQueued.pfInCache++; DPRINTF(HWPrefetch, "Dropping redundant in " "cache/MSHR prefetch addr:%#x\n", target_paddr); - } else if (target_paddr < 0x80000000) { + } else if (!system->isMemAddr(target_paddr)) { DPRINTF(HWPrefetch, "wrong paddr of prefetch:%#x\n", target_paddr); } else { @@ -552,7 +552,7 @@ Queued::insert(const PacketPtr &pkt, PrefetchInfo &new_pfi, const AddrPriority & "cache/MSHR prefetch addr:%#x\n", target_paddr); return; } - if (has_target_pa && (target_paddr < 0x80000000)) { + if (has_target_pa && !system->isMemAddr(target_paddr)) { DPRINTF(HWPrefetch, "wrong paddr of prefetch:%#x\n", target_paddr); return; } @@ -575,7 +575,7 @@ Queued::insert(const PacketPtr &pkt, PrefetchInfo &new_pfi, const AddrPriority & } else { // Add the translation request and try to resolve it later dpp.setTranslationRequest(translation_req); - dpp.tc = cache->system->threads[translation_req->contextId()]; + dpp.tc = 
system->threads[translation_req->contextId()]; DPRINTF(HWPrefetch, "Prefetch queued with no translation. " "addr:%#x priority: %3d\n", new_pfi.getAddr(), priority); addToQueue(pfqMissingTranslation, dpp); @@ -605,10 +605,10 @@ Queued::addToQueue(std::list &queue, if (dpp.pfahead) { // l1 can not process l3 pfahead request // but l3 can process l1 request - if (dpp.pfahead_host > cache->level()) { - panic("Prefetch req from src %i heading to l%i, but l%i can not process it\n", - dpp.pfInfo.getXsMetadata().prefetchSource, dpp.pfahead_host, cache->level()); - } + // if (dpp.pfahead_host > cache->level()) { + // panic("Prefetch req from src %i heading to l%i, but l%i can not process it\n", + // dpp.pfInfo.getXsMetadata().prefetchSource, dpp.pfahead_host, cache->level()); + // } } queue_size = queueSize; queue_name = "PFQ"; diff --git a/src/mem/cache/prefetch/sms.cc b/src/mem/cache/prefetch/sms.cc index 0d19d4f3b3..424a52eeb6 100644 --- a/src/mem/cache/prefetch/sms.cc +++ b/src/mem/cache/prefetch/sms.cc @@ -23,7 +23,7 @@ XSCompositePrefetcher::XSCompositePrefetcher(const XSCompositePrefetcherParams & p.pht_replacement_policy, PhtEntry(2 * (regionBlks - 1), SatCounter8(3, 2))), phtPFAhead(p.pht_pf_ahead), - phtPFLevel(std::min(p.pht_pf_level, (int) maxCacheLevel)), + phtPFLevel(std::min(p.pht_pf_level, (int) 3)), stats(this), pfBlockLRUFilter(pfFilterSize), pfPageLRUFilter(pfPageFilterSize), @@ -134,10 +134,9 @@ XSCompositePrefetcher::calculatePrefetch(const PrefetchInfo &pfi, std::vector= 3) { - Addr pf_tgt_addr_l3 = decr ? pf_tgt_addr - 256 * blkSize : pf_tgt_addr + 256 * blkSize; // depth here? - sendStreamPF(pfi, pf_tgt_addr_l3, addresses, pfPageLRUFilterL3, decr, 3); - } + + Addr pf_tgt_addr_l3 = decr ? pf_tgt_addr - 256 * blkSize : pf_tgt_addr + 256 * blkSize; // depth here? 
+ sendStreamPF(pfi, pf_tgt_addr_l3, addresses, pfPageLRUFilterL3, decr, 3); } } @@ -640,21 +639,21 @@ XSCompositePrefetcher::XSCompositeStats::XSCompositeStats(statistics::Group *par } void -XSCompositePrefetcher::setCache(BaseCache *_cache) +XSCompositePrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - Base::setCache(_cache); + Base::setParentInfo(sys, pm, _cache, blk_size); - largeBOP->setCache(_cache); - smallBOP->setCache(_cache); - learnedBOP->setCache(_cache); + largeBOP->setParentInfo(sys, pm, _cache, blk_size); + smallBOP->setParentInfo(sys, pm, _cache, blk_size); + learnedBOP->setParentInfo(sys, pm, _cache, blk_size); - berti->setCache(_cache); + berti->setParentInfo(sys, pm, _cache, blk_size); if (cmc) - cmc->setCache(_cache); + cmc->setParentInfo(sys, pm, _cache, blk_size); if (ipcp) - ipcp->setCache(_cache); + ipcp->setParentInfo(sys, pm, _cache, blk_size); } } // prefetch diff --git a/src/mem/cache/prefetch/sms.hh b/src/mem/cache/prefetch/sms.hh index cd5f59170e..83ca0bc1c5 100644 --- a/src/mem/cache/prefetch/sms.hh +++ b/src/mem/cache/prefetch/sms.hh @@ -203,7 +203,7 @@ class XSCompositePrefetcher : public Queued hintDownStream->notifyIns(ins_num); } } - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; }; } diff --git a/src/mem/cache/prefetch/worker.hh b/src/mem/cache/prefetch/worker.hh index 7e69b5e8a7..f94cca02f7 100644 --- a/src/mem/cache/prefetch/worker.hh +++ b/src/mem/cache/prefetch/worker.hh @@ -51,11 +51,12 @@ class WorkerPrefetcher : public Queued void rxHint(BaseMMU::Translation *dpp) override; std::pair rxMembusRatio(RequestorID requestorId) override { - long totalMissCount = cache->stats.cmd[MemCmd::ReadExReq]->misses.total() + - cache->stats.cmd[MemCmd::ReadSharedReq]->misses.total(); - long missCount = cache->stats.cmd[MemCmd::ReadExReq]->misses[requestorId].value() + - cache->stats.cmd[MemCmd::ReadSharedReq]->misses[requestorId].value(); - return std::pair(missCount, totalMissCount); + // long totalMissCount = cache->stats.cmd[MemCmd::ReadExReq]->misses.total() + + // cache->stats.cmd[MemCmd::ReadSharedReq]->misses.total(); + // long missCount = cache->stats.cmd[MemCmd::ReadExReq]->misses[requestorId].value() + + // cache->stats.cmd[MemCmd::ReadSharedReq]->misses[requestorId].value(); + // return std::pair(missCount, totalMissCount); + return std::pair(0, 0); }; void notifyIns(int ins_num) override { diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index ae2e20c4a3..4fc3019ae4 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -93,6 +93,7 @@ def MakeInclude(source): MakeInclude('slicc_interface/AbstractCacheEntry.hh') MakeInclude('slicc_interface/Message.hh') MakeInclude('slicc_interface/RubyRequest.hh') +MakeInclude('slicc_interface/XsPFMetaData.hh') # External types MakeInclude('common/Address.hh') diff --git a/src/mem/ruby/network/BasicLink.py b/src/mem/ruby/network/BasicLink.py index a275d9bd85..4515506c32 100644 --- a/src/mem/ruby/network/BasicLink.py +++ b/src/mem/ruby/network/BasicLink.py @@ -54,7 +54,7 @@ class BasicExtLink(BasicLink): ext_node = Param.RubyController("External node") int_node = Param.BasicRouter("ID of internal node") - bandwidth_factor = 16 # only used by simple network + bandwidth_factor = 64 # only used by simple network class BasicIntLink(BasicLink): @@ -70,4 +70,4 @@ class BasicIntLink(BasicLink): dst_inport = Param.String("", "Inport direction at dst 
router") # only used by simple network - bandwidth_factor = 16 + bandwidth_factor = 64 diff --git a/src/mem/ruby/network/MessageBuffer.cc b/src/mem/ruby/network/MessageBuffer.cc index 9a4439a538..a9ecbf163a 100644 --- a/src/mem/ruby/network/MessageBuffer.cc +++ b/src/mem/ruby/network/MessageBuffer.cc @@ -48,6 +48,7 @@ #include "base/stl_helpers.hh" #include "debug/RubyQueue.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/Sequencer.hh" namespace gem5 { @@ -355,6 +356,32 @@ MessageBuffer::unregisterDequeueCallback() m_dequeue_callback = nullptr; } +void +MessageBuffer::notifyMissCallback(Tick current_time, Sequencer& sequencer) +{ + int num_readys = 0; + for (auto& msg : m_prio_heap) { + if (msg->getLastEnqueueTime() <= current_time) { + num_readys++; + auto req = dynamic_cast(msg.get()); + sequencer.TBEFullCancel(req->m_LineAddress); + } + } + DPRINTF(RubyQueue, "MessageBuffer: has %d readys but not dequeue, need notifyMissCallback\n", num_readys); +} + +bool +MessageBuffer::hasPrefetchRequest(Addr addr) +{ + for (auto& msg : m_prio_heap) { + auto req = dynamic_cast(msg.get()); + if (req->m_LineAddress == makeLineAddress(addr) && req->m_Prefetch == PrefetchBit_Yes) { + return true; + } + } + return false; +} + void MessageBuffer::clear() { @@ -522,6 +549,19 @@ MessageBuffer::isReady(Tick current_time) const (m_dequeues_this_cy < m_max_dequeue_rate); bool is_ready = (m_prio_heap.size() > 0) && (m_prio_heap.front()->getLastEnqueueTime() <= current_time); + + if (debug::RubyQueue) { + int num_readys = 0; + for (auto& msg : m_prio_heap) + { + if (msg->getLastEnqueueTime() <= current_time) + { + num_readys ++; + } + } + DPRINTF(RubyQueue, "MessageBuffer: has %d readys\n", num_readys); + } + if (!can_dequeue && is_ready) { // Make sure the Consumer executes next cycle to dequeue the ready msg m_consumer->scheduleEvent(Cycles(1)); diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh index 03a0454433..b44517ba4a 100644 --- a/src/mem/ruby/network/MessageBuffer.hh +++ b/src/mem/ruby/network/MessageBuffer.hh @@ -71,6 +71,8 @@ namespace gem5 namespace ruby { +class Sequencer; + class MessageBuffer : public SimObject { public: @@ -145,6 +147,10 @@ class MessageBuffer : public SimObject void registerDequeueCallback(std::function callback); void unregisterDequeueCallback(); + void notifyMissCallback(Tick current_time, Sequencer& sequencer); + + bool hasPrefetchRequest(Addr addr); + void recycle(Tick current_time, Tick recycle_latency); bool isEmpty() const { return m_prio_heap.size() == 0; } bool isStallMapEmpty() { return m_stall_msg_map.size() == 0; } diff --git a/src/mem/ruby/network/simple/SimpleNetwork.py b/src/mem/ruby/network/simple/SimpleNetwork.py index e52333b24d..ea0dfcecee 100644 --- a/src/mem/ruby/network/simple/SimpleNetwork.py +++ b/src/mem/ruby/network/simple/SimpleNetwork.py @@ -55,7 +55,7 @@ class SimpleNetwork(RubyNetwork): "default internal buffer size for links and\ routers; 0 indicates infinite buffering", ) - endpoint_bandwidth = Param.Int(1000, "bandwidth adjustment factor") + endpoint_bandwidth = Param.Int(2048, "bandwidth adjustment factor") physical_vnets_channels = VectorParam.Int( [], diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 444111a1b2..881b8b8b76 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -64,16 +64,6 @@ structure(OutPort, external = "yes", primitive="yes") { bool isDeferredMsgMapEmpty(Addr 
addr); } -structure(InPort, external = "yes", primitive="yes") { - bool isReady(Tick current_time); - Tick dequeue(Tick current_time); - void recycle(Tick current_time, Tick recycle_latency); - bool isEmpty(); - bool isStallMapEmpty(); - int getStallMapSize(); - bool hasStalledMsg(Addr addr); -} - external_type(NodeID, default="0", primitive="yes"); external_type(MachineID); @@ -133,6 +123,7 @@ structure (Sequencer, external = "yes") { Cycles, Cycles, Cycles); void notifyMissCallback(Addr, bool, bool); + void TBEFullCancel(Addr); void writeCallback(Addr, DataBlock); void writeCallback(Addr, DataBlock, bool); @@ -166,11 +157,27 @@ structure (Sequencer, external = "yes") { bool checkResourceAvailable(CacheResourceType, Addr); } +structure(InPort, external = "yes", primitive="yes") { + bool isReady(Tick current_time); + Tick dequeue(Tick current_time); + void recycle(Tick current_time, Tick recycle_latency); + bool isEmpty(); + bool isStallMapEmpty(); + int getStallMapSize(); + bool hasStalledMsg(Addr addr); + void notifyMissCallback(Tick current_time, Sequencer sequencer); +} + structure (HTMSequencer, interface="Sequencer", external = "yes") { // hardware transactional memory void htmCallback(Addr, HtmCallbackMode, HtmFailedInCacheReason); } +structure (XsPFMetaData, desc="...", external="yes") { + bool validXsMetadata, desc="set if the Xs prefetch metadata is valid"; + int prefetchDepth, desc="depth recorded for the issuing prefetch"; +} + structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -193,6 +200,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { bool isSLCSet, default="false",desc="If flag is set, bypass GPU L1 and L2 caches"; RequestPtr getRequestPtr(); + XsPFMetaData getXsPFMeta(); } structure(AbstractCacheEntry, primitive="yes", external = "yes") { @@ -230,6 +238,7 @@ structure (CacheMemory, external = "yes") { void htmCommitTransaction(); void htmAbortTransaction(); + int level(); int getCacheSize(); int getNumBlocks(); Addr getAddressAtIdx(int); @@ -266,11 +275,14 @@ structure (RubyPrefetcher, external = "yes") { } structure(RubyPrefetcherProxy, external = "yes") { - void notifyPfHit(RequestPtr, bool, DataBlock); - void notifyPfMiss(RequestPtr, bool, DataBlock); + void notifyPfHit(RequestPtr, bool, XsPFMetaData, DataBlock); + void notifyPfMiss(RequestPtr, bool, XsPFMetaData, DataBlock); void notifyPfFill(RequestPtr, DataBlock, bool); - void notifyPfEvict(Addr, bool, RequestorID); + void notifyPfEvict(Addr, bool, XsPFMetaData, RequestorID); void completePrefetch(Addr); + void pfHitInCache(XsPFMetaData); + void notifyHitToDownStream(RequestPtr); + void offloadToDownStream(); // SLICC controller must define its own regProbePoints and call // this for every RubyPrefetcherProxy object present void regProbePoints(); diff --git a/src/mem/ruby/protocol/RubySlicc_Util.sm b/src/mem/ruby/protocol/RubySlicc_Util.sm index 104c7c034c..2cbbbb1435 100644 --- a/src/mem/ruby/protocol/RubySlicc_Util.sm +++ b/src/mem/ruby/protocol/RubySlicc_Util.sm @@ -40,6 +40,7 @@ // Miscallaneous Functions +void warn(std::string msg); void error(std::string msg); void assert(bool condition); Cycles zero_time(); @@ -61,3 +62,6 @@ structure(BoolVec, external="yes") { } int countBoolVec(BoolVec bVec); RequestorID getRequestorID(RequestPtr req); +XsPFMetaData getRequestXsMetaData(RequestPtr req); +void setRequestXsMetaData(RequestPtr req, XsPFMetaData pfmeta); +bool XsMetaIsNotNull(XsPFMetaData pfmeta); diff --git
a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm index f85a1aca6e..cb55052602 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm @@ -144,6 +144,7 @@ action(AllocateTBE_SeqRequest, desc="") { out_msg.seqReq := in_msg.getRequestPtr(); out_msg.isSeqReqValid := true; assert(in_msg.Prefetch == PrefetchBit:No); + out_msg.is_from_cpu := true; out_msg.is_local_pf := false; out_msg.is_remote_pf := false; out_msg.txnId := max_outstanding_transactions; @@ -192,6 +193,7 @@ action(AllocateTBE_SeqDvmRequest, desc="") { out_msg.accAddr := in_msg.tlbiTransactionUid; out_msg.accSize := blockSize; assert(in_msg.Prefetch == PrefetchBit:No); + out_msg.is_from_cpu := true; out_msg.is_local_pf := false; out_msg.is_remote_pf := false; @@ -236,6 +238,7 @@ action(AllocateTBE_PfRequest, desc="Allocate TBE for prefetch request") { assert(in_msg.Prefetch != PrefetchBit:No); out_msg.is_local_pf := true; out_msg.is_remote_pf := false; + out_msg.pfmeta := in_msg.getXsPFMeta(); out_msg.txnId := max_outstanding_transactions; if (in_msg.Type == RubyRequestType:LD) { @@ -248,6 +251,10 @@ action(AllocateTBE_PfRequest, desc="Allocate TBE for prefetch request") { } } pfInPort.dequeue(clockEdge()); + if (!storTBEs.areNSlotsAvailable(1)) + { + pfProxy.offloadToDownStream(); + } } action(Initiate_Request, desc="") { @@ -788,7 +795,11 @@ action(Initiate_LoadHit, desc="") { } action(Initiate_LoadMiss, desc="") { - if (tbe.doCacheFill) { + if (is_HN) { + tbe.actions.push(Event:SendReadNoSnp); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:TagArrayWrite); + } else if (tbe.doCacheFill) { tbe.actions.push(Event:SendReadShared); tbe.actions.push(Event:CheckCacheFill); tbe.actions.push(Event:TagArrayWrite); @@ -1543,7 +1554,7 @@ action(Send_ReadShared, desc="") { action(Send_ReadNoSnp, desc="") { assert(is_HN); - assert((tbe.use_DMT == false) || + assert((tbe.use_DMT == false || tbe.is_local_pf) || ((tbe.reqType == CHIRequestType:AtomicReturn) || (tbe.reqType == CHIRequestType:AtomicNoReturn))); @@ -2265,7 +2276,7 @@ action(UpdateDirState_FromReqDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromReqDataResp", tbe); } action(UpdateDirState_FromSnpDataResp, desc="") { @@ -2338,7 +2349,7 @@ action(UpdateDirState_FromSnpDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromSnpDataResp", tbe); } action(UpdateDataState_FromReqDataResp, desc="") { @@ -2420,7 +2431,7 @@ action(UpdateDataState_FromReqDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromReqDataResp", tbe); } action(UpdateDataState_FromWUDataResp, desc="") { @@ -2442,7 +2453,7 @@ action(UpdateDataState_FromWUDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromWUDataResp", tbe); } action(UpdateDataState_FromADataResp, desc="") { @@ -2463,7 +2474,7 @@ action(UpdateDataState_FromADataResp, desc="") { (tbe.reqType == CHIRequestType:AtomicNoReturn))){ tbe.dataMaybeDirtyUpstream := false; } - printTBEState(tbe); + printTBEState("UpdateDataState_FromADataResp", tbe); } action(UpdateDataState_FromCUResp, desc="") { @@ -2475,7 +2486,7 @@ action(UpdateDataState_FromCUResp, desc="") { // self and upstream may have been invalidated while waiting for this // expect to follow up with a ReadUnique } - printTBEState(tbe); + printTBEState("UpdateDataState_FromCUResp", tbe); } action(UpdateDataState_FromSnpDataResp, desc="") { @@ -2521,7 
+2532,7 @@ action(UpdateDataState_FromSnpDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromSnpDataResp", tbe); } action(UpdateDirState_FromReqResp, desc="") { @@ -2549,7 +2560,7 @@ action(UpdateDirState_FromReqResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromReqResp", tbe); } action(UpdateDirState_FromSnpResp, desc="") { @@ -2611,7 +2622,7 @@ action(UpdateDirState_FromSnpResp, desc="") { tbe.dataMaybeDirtyUpstream := tbe.dir_ownerExists; } - printTBEState(tbe); + printTBEState("UpdateDirState_FromSnpResp", tbe); } action(Receive_ReqResp, desc="") { @@ -2777,7 +2788,7 @@ action(Send_CompData, desc="") { tbe.snd_destination := tbe.requestor; setupPendingSend(tbe); - printTBEState(tbe); + printTBEState("Send_CompData", tbe); } action(Send_WBData, desc="") { @@ -3250,7 +3261,7 @@ action(Send_CompData_AR, desc="") { tbe.requestorToBeOwner := false; tbe.snd_destination := tbe.requestor; setupPendingSend(tbe); - printTBEState(tbe); + printTBEState("Send_CompData_AR", tbe); } @@ -3582,23 +3593,31 @@ action(Callback_LoadHit, desc="") { } } +action(Callback_PrefetchLate, desc="load miss caused by a late prefetch") { + assert(is_valid(tbe)); + if (is_dcache && tbe.is_from_cpu && tbe.is_local_pf && use_prefetcher) { + XsPFMetaData late_in_tbe := tbe.pfmeta; + pfProxy.notifyPfMiss(tbe.seqReq, true, late_in_tbe, tbe.dataBlk); + } +} + action(Callback_DelayedL1DResult_Load, desc="L1D load delayed due to load miss") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { sequencer.notifyMissCallback(tbe.addr, false, false); } } action(Callback_DelayedL1DResult_Store, desc="L1D store delayed due to store") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { sequencer.notifyMissCallback(tbe.addr, true, false); } } action(Callback_DelayedL1DResult_Busy, desc="L1D transaction delayed due to busy") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { if (tbe.reqType == CHIRequestType:Load) { sequencer.notifyMissCallback(tbe.addr, false, true); } else if (tbe.reqType == CHIRequestType:Store) { @@ -3650,6 +3669,7 @@ action(Callback_ExpressPrefetchHit, desc="") { cache.profilePrefetchHit(); peek(reqRdyPort, CHIRequestMsg) { assert(in_msg.is_local_pf); + pfProxy.pfHitInCache(in_msg.pfmeta); pfProxy.completePrefetch(in_msg.addr); } } @@ -3763,7 +3783,7 @@ action(Profile_Miss, desc="") { cache.profilePrefetchMiss(); } // notify prefetcher about this demand miss - if (use_prefetcher && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { + if (use_prefetcher && !isAtomicReqType(tbe.reqType) && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { bool is_read := false; if (isReadReqType(tbe.reqType)) { is_read := true; @@ -3773,7 +3793,8 @@ action(Profile_Miss, desc="") { // FIXME: this dataBlk is likely to have stale data. This should be fixed // if our prefetcher uses cached data to make prefetch decisions.
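// A minimal sketch (illustrative only, using names from this patch) of how
// the new XsPFMetaData argument is threaded through the notifications below:
// demand misses pass an empty metadata object, while accesses that touch a
// prefetched block forward the metadata stored in the entry at fill time:
//
//   XsPFMetaData meta;                          // empty: no prefetch provenance
//   if (is_valid(cache_entry) && cache_entry.HWPrefetched) {
//     meta := cache_entry.pfmeta;               // prefetched fill: keep source/depth
//   }
//   pfProxy.notifyPfMiss(tbe.seqReq, is_read, meta, tbe.dataBlk);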
- pfProxy.notifyPfMiss(tbe.seqReq, is_read, tbe.dataBlk); + XsPFMetaData empty; + pfProxy.notifyPfMiss(tbe.seqReq, is_read, empty, tbe.dataBlk); } } @@ -3790,14 +3811,15 @@ action(Profile_Hit, desc="") { cache.profilePrefetchHit(); } // notify prefetcher about this demand hit - if (use_prefetcher && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { + if (use_prefetcher && !isAtomicReqType(tbe.reqType) && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { bool is_read := false; if (isReadReqType(tbe.reqType)) { is_read := true; } else { assert(isWriteReqType(tbe.reqType)); } - pfProxy.notifyPfHit(tbe.seqReq, is_read, tbe.dataBlk); + pfProxy.notifyPfHit(tbe.seqReq, is_read, cache_entry.pfmeta, tbe.dataBlk); + pfProxy.notifyHitToDownStream(tbe.seqReq); cache_entry.HWPrefetched := false; } @@ -3807,11 +3829,16 @@ action(Profile_Fill, desc="") { assert(is_valid(tbe)); assert(is_valid(cache_entry)); if (use_prefetcher && tbe.isSeqReqValid) { - cache_entry.HWPrefetched := tbe.is_local_pf || - (tbe.is_remote_pf && - (upstream_prefetch_trains_prefetcher == false)); + cache_entry.HWPrefetched := tbe.is_local_pf || + (tbe.is_remote_pf && upstream_prefetch_trains_prefetcher); + cache_entry.everPrefetched := cache_entry.HWPrefetched; cache_entry.requestor := getRequestorID(tbe.seqReq); + if (cache_entry.HWPrefetched) { + cache_entry.pfmeta := getRequestXsMetaData(tbe.seqReq); + } else { + XsPFMetaData empty; + cache_entry.pfmeta := empty; + } // Prefetchers that use this info require notifications from both // demand and pf fills (unlike notifyPfHit/notifyPfMiss) @@ -3829,7 +3856,7 @@ action(Profile_Eviction, desc="") { sequencer.evictionCallback(address); } if (use_prefetcher && is_valid(cache_entry)) { - pfProxy.notifyPfEvict(address, cache_entry.HWPrefetched, cache_entry.requestor); + pfProxy.notifyPfEvict(address, cache_entry.HWPrefetched, cache_entry.pfmeta, cache_entry.requestor); } } diff --git a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm index d2272d341e..2357fed9f9 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm @@ -316,6 +316,10 @@ bool inCache(Addr addr, bool is_secure) { } } +int level() { + return cache.level(); +} + bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) { CacheEntry entry := getCacheEntry(makeLineAddress(addr)); if (is_valid(entry)) { @@ -334,6 +338,25 @@ bool hasBeenPrefetched(Addr addr, bool is_secure) { } } +bool hasEverBeenPrefetched(Addr addr, bool is_secure) { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + if (is_valid(entry)) { + return entry.everPrefetched; + } else { + return false; + } +} + +XsPFMetaData getHitBlkXsMetadata(Addr addr, bool is_secure) { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + XsPFMetaData empty; + if (is_valid(entry)) { + return entry.pfmeta; + } else { + return empty; + } +} + bool inMissQueue(Addr addr, bool is_secure) { Addr line_addr := makeLineAddress(addr); TBE tbe := getCurrentActiveTBE(line_addr); @@ -344,6 +367,16 @@ bool coalesce() { return false; } +AbstractCacheEntry CacheEntryToAbstract(CacheEntry entry), return_by_pointer="yes"; + +AbstractCacheEntry findBlock(Addr addr, bool is_secure), return_by_pointer="yes" { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + if (is_valid(entry)) { + return CacheEntryToAbstract(entry); + } + return OOD; +} + void notifyCoalesced(Addr addr, RubyRequestType type, RequestPtr req, DataBlock data_blk, bool was_miss) { DPRINTF(RubySlicc,
"notifyCoalesced(addr=%#x, type=%s, was_miss=%d)\n", @@ -357,10 +390,11 @@ void notifyCoalesced(Addr addr, RubyRequestType type, RequestPtr req, bool is_read := (type == RubyRequestType:LD) || (type == RubyRequestType:Load_Linked) || (type == RubyRequestType:IFETCH); + XsPFMetaData empty; if (was_miss) { - pfProxy.notifyPfMiss(req, is_read, data_blk); + pfProxy.notifyPfMiss(req, is_read, empty, data_blk); } else { - pfProxy.notifyPfHit(req, is_read, data_blk); + pfProxy.notifyPfHit(req, is_read, empty, data_blk); } } } @@ -444,8 +478,10 @@ TBE allocateRequestTBE(Addr addr, CHIRequestMsg in_msg), return_by_pointer="yes" tbe.isSeqReqValid := in_msg.isSeqReqValid; tbe.seqReq := in_msg.seqReq; + tbe.is_from_cpu := in_msg.is_from_cpu; tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; + tbe.pfmeta := in_msg.pfmeta; tbe.atomic_op.clear(); tbe.atomic_op.orMask(in_msg.atomic_op); @@ -485,6 +521,7 @@ TBE allocateDvmRequestTBE(Addr txnId, CHIRequestMsg in_msg), return_by_pointer=" tbe.isSeqReqValid := in_msg.isSeqReqValid; tbe.seqReq := in_msg.seqReq; + tbe.is_from_cpu := in_msg.is_from_cpu; tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; @@ -842,14 +879,18 @@ bool upstreamHasShared(State state) { (state == State:UC_RSC) || (state == State:SC_RSC); } -void printTBEState(TBE tbe) { - DPRINTF(RubySlicc, "STATE: addr: %#x data present=%d valid=%d unique=%d dirty=%d mu_dirty=%d dir ownerV=%d ownerE=%d sharers=%d tobe_I=%d tobe_SC=%d doFill=%d pendAction=%s txnId=%d\n", - tbe.addr, tbe.dataBlkValid.isFull(), tbe.dataValid, tbe.dataUnique, - tbe.dataDirty, tbe.dataMaybeDirtyUpstream, tbe.dir_ownerExists, - tbe.dir_ownerIsExcl,tbe.dir_sharers.count(), - tbe.dataToBeInvalid, tbe.dataToBeSharedClean, - tbe.doCacheFill, tbe.pendAction, tbe.txnId); - DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); +void printTBEState(std::string pos, TBE tbe) { + if (is_valid(tbe)) { + DPRINTF(RubySlicc, "At %s: STATE: addr: %#x data present=%d valid=%d unique=%d dirty=%d mu_dirty=%d dir ownerV=%d ownerE=%d sharers=%d tobe_I=%d tobe_SC=%d doFill=%d pendAction=%s txnId=%d\n", + pos, tbe.addr, tbe.dataBlkValid.isFull(), tbe.dataValid, tbe.dataUnique, + tbe.dataDirty, tbe.dataMaybeDirtyUpstream, tbe.dir_ownerExists, + tbe.dir_ownerIsExcl,tbe.dir_sharers.count(), + tbe.dataToBeInvalid, tbe.dataToBeSharedClean, + tbe.doCacheFill, tbe.pendAction, tbe.txnId); + } else { + DPRINTF(RubySlicc, "At %s: invalid tbe\n", pos); + } + // DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); } void printDvmTBEState(TBE tbe) { @@ -945,7 +986,7 @@ void copyCacheAndDir(CacheEntry cache_entry, DirEntry dir_entry, tbe.dataToBeSharedClean := false; tbe.dataToBeInvalid := false; - printTBEState(tbe); + printTBEState("copyCacheAndDir", tbe); } void copyCacheAndDirTBEs(TBE src, TBE dst) { @@ -961,7 +1002,7 @@ void copyCacheAndDirTBEs(TBE src, TBE dst) { dst.dir_owner := src.dir_owner; dst.dir_ownerExists := src.dir_ownerExists; dst.dir_ownerIsExcl := src.dir_ownerIsExcl; - printTBEState(dst); + printTBEState("copyCacheAndDirTBEs", dst); } void deallocateReqTBE(TBE tbe) { @@ -1093,7 +1134,7 @@ State makeFinalStateHelper(State cs, State ds) { State makeFinalState(TBE tbe, CacheEntry cache_entry, DirEntry dir_entry) { setDataToBeStates(tbe); - printTBEState(tbe); + printTBEState("makeFinalState", tbe); State cache_state := State:I; State dir_state := State:I; @@ -1207,6 +1248,14 @@ bool isWriteReqType(CHIRequestType type) { return false; } +bool isAtomicReqType(CHIRequestType 
type) { + if (type == CHIRequestType:AtomicLoad || + type == CHIRequestType:AtomicStore) { + return true; + } + return false; +} + bool isStashReqType(CHIRequestType type) { if (type == CHIRequestType:StashOnceShared || type == CHIRequestType:StashOnceUnique) { diff --git a/src/mem/ruby/protocol/chi/CHI-cache-ports.sm b/src/mem/ruby/protocol/chi/CHI-cache-ports.sm index 8bb76fdb25..c4e84a4189 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-ports.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-ports.sm @@ -318,6 +318,9 @@ in_port(reqRdyPort, CHIRequestMsg, reqRdy, rank=3, } else { CacheEntry cache_entry := getCacheEntry(in_msg.addr); TBE tbe := getCurrentActiveTBE(in_msg.addr); + if (is_valid(tbe)) { + tbe.is_from_cpu := in_msg.is_from_cpu; + } DirEntry dir_entry := getDirEntry(in_msg.addr); @@ -421,6 +424,9 @@ in_port(seqInPort, RubyRequest, mandatoryQueue, rank=1) { ); } } else { + if (!storTBEs.areNSlotsAvailable(1) || !reqRdy.areNSlotsAvailable(1, curTick())) { + seqInPort.notifyMissCallback(clockEdge(), sequencer); + } trigger(Event:AllocSeqRequest, in_msg.LineAddress, getCacheEntry(in_msg.LineAddress), getCurrentActiveTBE(in_msg.LineAddress)); @@ -482,7 +488,7 @@ void processNextState(Addr address, TBE tbe, CacheEntry cache_entry) { } } - printTBEState(tbe); + printTBEState("processNextState", tbe); // we might be going to BUSY_INTERRUPTABLE so wakeup pending snoops // if any diff --git a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm index 8d531e93c0..03bae6bdde 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm @@ -968,6 +968,7 @@ transition({BUSY_BLKD,BUSY_INTR}, } transition({BUSY_BLKD,BUSY_INTR}, Load) { + Callback_PrefetchLate; Callback_DelayedL1DResult_Load; StallRequest; } diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm index fce153f35e..a10ca4e5b5 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache.sm @@ -167,7 +167,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // If the responder has the line in UC or UD state, propagate this state // on a ReadShared. Notice data won't be deallocated if dealloc_on_unique is // set - bool fwd_unique_on_readshared := "False"; + bool fwd_unique_on_readshared := "True"; // Allow receiving data in SD state. 
bool allow_SD; @@ -584,8 +584,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : structure(CacheEntry, interface="AbstractCacheEntry") { State state, desc="SLICC line state"; DataBlock DataBlk, desc="data for the block"; + bool everPrefetched, default="false", desc="Set if this cache entry was ever prefetched"; bool HWPrefetched, default="false", desc="Set if this cache entry was prefetched"; RequestorID requestor, desc="First requestor to fill this block"; + XsPFMetaData pfmeta, desc="Xs prefetch meta data"; } // Directory entry @@ -665,8 +667,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : // if either is set prefetchers are not notified on miss/hit/fill and // demand hit/miss stats are not incremented + bool is_from_cpu, default="false", desc="Request generated by CPU"; bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + XsPFMetaData pfmeta, desc="Prefetch metadata attached when a local prefetcher generated the request"; // Atomic info associated with the transaction WriteMask atomic_op, desc="Atomic Operation Wrapper"; diff --git a/src/mem/ruby/protocol/chi/CHI-msg.sm b/src/mem/ruby/protocol/chi/CHI-msg.sm index ce0d3d88cf..5369e5a8f7 100644 --- a/src/mem/ruby/protocol/chi/CHI-msg.sm +++ b/src/mem/ruby/protocol/chi/CHI-msg.sm @@ -115,8 +115,10 @@ structure(CHIRequestMsg, desc="", interface="Message") { RequestPtr seqReq, default="nullptr", desc="Pointer to original request from CPU/sequencer (nullptr if not valid)"; bool isSeqReqValid, default="false", desc="Set if seqReq is valid (not nullptr)"; + bool is_from_cpu, default="false", desc="Request generated by CPU"; bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + XsPFMetaData pfmeta, desc="Prefetch metadata attached when a local prefetcher generated the request"; WriteMask atomic_op, desc="Atomic Operation Wrapper"; diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 0049c9d90c..63b11d61cc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ namespace ruby class Network; class GPUCoalescer; class DMASequencer; +class AbstractCacheEntry; // used to communicate that an in_port peeked the wrong message type class RejectException: public std::exception @@ -358,19 +359,30 @@ class AbstractController : public ClockedObject, public Consumer, public HasDown virtual bool inCache(const Addr &addr, const bool &is_secure) { fatal("inCache: prefetching not supported"); return false; } + virtual int level() + { fatal("level: prefetching not supported"); return 0; } + virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure) { fatal("hasBeenPrefetched: prefetching not supported"); return false; } - virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure, - const RequestorID &requestor) + virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure, const RequestorID &requestor) { fatal("hasBeenPrefetched: prefetching not supported"); return false; } + virtual bool hasEverBeenPrefetched(const Addr &addr, const bool &is_secure) + { fatal("hasEverBeenPrefetched: prefetching not supported"); return false; } + + virtual Request::XsMetadata getHitBlkXsMetadata(const Addr &addr, const bool &is_secure) + { fatal("getHitBlkXsMetadata: prefetching not supported"); return
Request::XsMetadata(); } + virtual bool inMissQueue(const Addr &addr, const bool &is_secure) { fatal("inMissQueue: prefetching not supported"); return false; } virtual bool coalesce() { fatal("coalesce: prefetching not supported"); return false; } + virtual AbstractCacheEntry* findBlock(const Addr &addr, const bool &is_secure) + { fatal("findBlock: prefetching not supported"); return nullptr; } + friend class RubyPrefetcherProxy; protected: diff --git a/src/mem/ruby/slicc_interface/Controller.py b/src/mem/ruby/slicc_interface/Controller.py index ef8a0afbf1..ead3327617 100644 --- a/src/mem/ruby/slicc_interface/Controller.py +++ b/src/mem/ruby/slicc_interface/Controller.py @@ -54,7 +54,7 @@ class RubyController(ClockedObject): cluster_id = Param.UInt32(0, "Id of this controller's cluster") transitions_per_cycle = Param.Int( - 32, "no. of SLICC state machine transitions per cycle" + 1024, "no. of SLICC state machine transitions per cycle" ) buffer_size = Param.UInt32(0, "max buffer size 0 means infinite") diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index d0de3b1f13..0e90aa9ef8 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -51,6 +51,7 @@ #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" #include "mem/ruby/protocol/RubyRequestType.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" namespace gem5 { @@ -225,6 +226,10 @@ class RubyRequest : public Message const int& getSize() const { return m_Size; } const PrefetchBit& getPrefetch() const { return m_Prefetch; } RequestPtr getRequestPtr() const { return m_pkt->req; } + XsPFMetaData getXsPFMeta() const + { + return m_pkt->req->getXsMetadata(); + } void setWriteMask(uint32_t offset, uint32_t len, std::vector< std::pair<int, AtomicOpFunctor*>> atomicOps); diff --git a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh index 8df56c7013..5647270405 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh @@ -58,6 +58,7 @@ #include "mem/ruby/common/TypeDefines.hh" #include "mem/ruby/common/WriteMask.hh" #include "mem/ruby/protocol/RubyRequestType.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" namespace gem5 { @@ -65,6 +66,8 @@ namespace ruby { +class AbstractCacheEntry; + inline Cycles zero_time() { return Cycles(0); } inline Cycles intToCycles(int c) { return Cycles(c); } @@ -322,6 +325,30 @@ getRequestorID(RequestPtr req) return req->requestorId(); } +inline XsPFMetaData +getRequestXsMetaData(RequestPtr& req) +{ + return req->getXsMetadata(); +} + +inline void +setRequestXsMetaData(RequestPtr& req, XsPFMetaData& pfmeta) +{ + req->setXsMetadata(pfmeta); +} + +inline bool +XsMetaIsNotNull(XsPFMetaData& pfmeta) +{ + return pfmeta.prefetchSource != PrefetchSourceType::PF_NONE; +} + +inline AbstractCacheEntry* +CacheEntryToAbstract(AbstractCacheEntry* entry) +{ + return entry; +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/slicc_interface/XsPFMetaData.hh b/src/mem/ruby/slicc_interface/XsPFMetaData.hh new file mode 100644 index 0000000000..4c0f9d391d --- /dev/null +++ b/src/mem/ruby/slicc_interface/XsPFMetaData.hh @@ -0,0 +1,19 @@ +#ifndef __MEM_RUBY_SLICC_INTERFACE_XSPFMETADATA_HH__ +#define __MEM_RUBY_SLICC_INTERFACE_XSPFMETADATA_HH__ +#include "mem/request.hh" + +namespace gem5 +{ + +using XsPFMetaData = Request::XsMetadata; + +inline +std::ostream&
operator<<(std::ostream& os, const XsPFMetaData& meta) +{ + os << "pfsource[" << meta.prefetchSource << "]"; + return os; +} + +} + +#endif diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index 90d67fb29b..b98d1c8316 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -77,6 +77,7 @@ CacheMemory::CacheMemory(const Params &p) p.ruby_system->clockPeriod()), cacheMemoryStats(this) { + m_level = p.level; m_cache_size = p.size; m_cache_assoc = p.assoc; m_replacementPolicy_ptr = p.replacement_policy; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 58e8722a00..761aa6ad5e 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -155,6 +155,7 @@ class CacheMemory : public SimObject void htmCommitTransaction(); public: + int level() const { return m_level; } int getCacheSize() const { return m_cache_size; } int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } @@ -189,6 +190,7 @@ class CacheMemory : public SimObject BankedArray tagArray; ALUFreeListArray atomicALUArray; + int m_level; int m_cache_size; int m_cache_num_sets; int m_cache_num_set_bits; diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 2f457f5c4a..eb181f4609 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -35,6 +35,7 @@ class RubyCache(SimObject): cxx_class = "gem5::ruby::CacheMemory" cxx_header = "mem/ruby/structures/CacheMemory.hh" + level = Param.Int(-1, "the level of cache (l1 is 1, l2 is 2, etc.)") size = Param.MemorySize("capacity in bytes") assoc = Param.Int("") replacement_policy = Param.BaseReplacementPolicy(TreePLRURP(), "") diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.cc b/src/mem/ruby/structures/RubyPrefetcherProxy.cc index 80844da0d7..7e688d8066 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.cc +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.cc @@ -38,6 +38,7 @@ #include "mem/ruby/structures/RubyPrefetcherProxy.hh" #include "debug/HWPrefetch.hh" +#include "mem/ruby/slicc_interface/RubySlicc_Util.hh" #include "mem/ruby/system/RubySystem.hh" namespace gem5 @@ -50,6 +51,7 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent, prefetch::Base* _prefetcher, MessageBuffer *_pf_queue) :Named(_parent->name()), + stat(_prefetcher), prefetcher(_prefetcher), cacheCntrl(_parent), pfQueue(_pf_queue), @@ -63,16 +65,22 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent, fatal_if(!pfQueue, "%s initializing a RubyPrefetcherProxy without a prefetch queue", name()); - // prefetcher->setParentInfo( - // cacheCntrl->params().system, - // cacheCntrl->getProbeManager(), - // RubySystem::getBlockSizeBytes()); - // Cannot do this below: RPP is not SimObject - // block size is same as system! 
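// Because the proxy now implements CacheAccessor (passed as `this` in the
// setParentInfo() call below), a prefetcher can query the Ruby cache exactly
// as it would a classic cache. A minimal usage sketch, assuming a
// prefetch::Base subclass keeps the accessor in its `cache` member:
//
//   if (cache->inCache(line_addr, is_secure) ||
//       cache->inMissQueue(line_addr, is_secure))
//       return;  // line already present or pending; drop the prefetch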
- prefetcher->setPMInfoDirty(cacheCntrl->getProbeManager()); + prefetcher->setParentInfo( + cacheCntrl->params().system, + cacheCntrl->getProbeManager(), + this, + RubySystem::getBlockSizeBytes()); } } +RubyPrefetcherProxy::PfProxyStat::PfProxyStat(statistics::Group *parent) + : statistics::Group(parent, "RubyPrefetcherProxy"), + ADD_STAT(notifymiss, "times the prefetcher was notified of a demand miss"), + ADD_STAT(notifyhit, "times the prefetcher was notified of a demand hit"), + ADD_STAT(issuedPf, "prefetch requests issued to the cache controller") +{ +} + void RubyPrefetcherProxy::scheduleNextPrefetch() { @@ -124,9 +132,7 @@ RubyPrefetcherProxy::issuePrefetch() pkt->getAddr(), line_addr, pkt->needsWritable()); - RubyRequestType req_type = pkt->needsWritable() ? - RubyRequestType_ST : RubyRequestType_LD; - + RubyRequestType req_type = RubyRequestType_LD; std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(cacheCntrl->clockEdge(), pkt->getAddr(), @@ -136,10 +142,11 @@ RubyPrefetcherProxy::issuePrefetch() RubyAccessMode_Supervisor, pkt, PrefetchBit_Yes); - + assert(msg->getRequestPtr()->hasXsMetadata()); // enqueue request into prefetch queue to the cache pfQueue->enqueue(msg, cacheCntrl->clockEdge(), cacheCntrl->cyclesToTicks(Cycles(1))); + stat.issuedPf++; // track all pending PF requests issuedPfPkts[line_addr] = pkt; @@ -157,33 +164,51 @@ } void -RubyPrefetcherProxy::notifyPfHit(const RequestPtr& req, bool is_read, +RubyPrefetcherProxy::notifyPfHit(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk) { assert(ppHit); assert(req); + stat.notifyhit++; Packet pkt(req, is_read ? Packet::makeReadCmd(req) : Packet::makeWriteCmd(req)); // NOTE: for now we only communicate physical address with prefetchers pkt.dataStaticConst(data_blk.getData(getOffset(req->getPaddr()), pkt.getSize())); DPRINTF(HWPrefetch, "notify hit: %s\n", pkt.print()); + pkt.missOnLatePf = false; + pkt.pfSource = pfmeta.prefetchSource; + pkt.pfDepth = pfmeta.prefetchDepth; ppHit->notify(&pkt); scheduleNextPrefetch(); } void -RubyPrefetcherProxy::notifyPfMiss(const RequestPtr& req, bool is_read, +RubyPrefetcherProxy::notifyPfMiss(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk) { assert(ppMiss); assert(req); - Packet pkt(req, is_read ? Packet::makeReadCmd(req) : - Packet::makeWriteCmd(req)); + stat.notifymiss++; + Packet pkt(req, is_read ?
MemCmd::ReadReq : MemCmd::WriteReq); // NOTE: for now we only communicate physical address with prefetchers pkt.dataStaticConst(data_blk.getData(getOffset(req->getPaddr()), pkt.getSize())); DPRINTF(HWPrefetch, "notify miss: %s\n", pkt.print()); + pkt.missOnLatePf = (pfmeta.prefetchSource != PrefetchSourceType::PF_NONE); + pkt.pfSource = pfmeta.prefetchSource; + pkt.pfDepth = pfmeta.prefetchDepth; + if (!pkt.missOnLatePf && issuedPfPkts.count(makeLineAddress(req->getPaddr())) > 0) + { + auto pfpkt = issuedPfPkts[makeLineAddress(req->getPaddr())]; + pkt.missOnLatePf = true; + pkt.pfSource = pfpkt->req->getXsMetadata().prefetchSource; + pkt.pfDepth = pfpkt->req->getXsMetadata().prefetchDepth; + } + if (XsMetaIsNotNull(pfmeta)) { + prefetcher->pfHitInMSHR(pfmeta.prefetchSource); + } + prefetcher->incrDemandMhsrMisses(); ppMiss->notify(&pkt); scheduleNextPrefetch(); } @@ -207,19 +232,42 @@ RubyPrefetcherProxy::notifyPfFill(const RequestPtr& req, } void -RubyPrefetcherProxy::notifyPfEvict(Addr blkAddr, bool hwPrefetched, +RubyPrefetcherProxy::notifyPfEvict(Addr blkAddr, bool hwPrefetched, XsPFMetaData& pfmeta, RequestorID requestorID) { DPRINTF(HWPrefetch, "notify evict: %#x hw_pf=%d\n", blkAddr, hwPrefetched); // DataUpdate data_update( // blkAddr, false, requestorID, *this); // Maybe using the old DataUpdate here is enough + if (hwPrefetched) { + prefetcher->prefetchUnused(pfmeta.prefetchSource); + } DataUpdate data_update(blkAddr, false); // data_update.hwPrefetched = hwPrefetched; ppDataUpdate->notify(data_update); scheduleNextPrefetch(); } +void +RubyPrefetcherProxy::pfHitInCache(const XsPFMetaData& pfmeta) +{ + prefetcher->pfHitInCache(pfmeta.prefetchSource); +} + +void +RubyPrefetcherProxy::notifyHitToDownStream(const RequestPtr& req) +{ + // TODO: +} + +void +RubyPrefetcherProxy::offloadToDownStream() +{ + if (prefetcher->hasHintDownStream()) { + prefetcher->offloadToDownStream(); + } +} + void RubyPrefetcherProxy::regProbePoints() { diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.hh b/src/mem/ruby/structures/RubyPrefetcherProxy.hh index 79c311ba4d..47dce0b3d4 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.hh +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.hh @@ -43,8 +43,11 @@ // #include "mem/cache/cache_probe_arg.hh" #include "mem/cache/base.hh" #include "mem/cache/prefetch/base.hh" +#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh" #include "mem/ruby/slicc_interface/AbstractController.hh" #include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" +#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -54,7 +57,7 @@ namespace ruby // Removed cache accessor using DataUpdate = BaseCache::DataUpdate; -class RubyPrefetcherProxy : /*public CacheAccessor,*/ public Named +class RubyPrefetcherProxy : public CacheAccessor, public Named { public: @@ -71,20 +74,37 @@ class RubyPrefetcherProxy : /*public CacheAccessor,*/ public Named /** * Notify PF probes hit/miss/fill */ - void notifyPfHit(const RequestPtr& req, bool is_read, + void notifyPfHit(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk); - void notifyPfMiss(const RequestPtr& req, bool is_read, + + void notifyPfMiss(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk); + void notifyPfFill(const RequestPtr& req, const DataBlock& data_blk, bool from_pf); - void notifyPfEvict(Addr blkAddr, bool hwPrefetched, + void notifyPfEvict(Addr blkAddr, bool hwPrefetched, XsPFMetaData& pfmeta, RequestorID 
requestorID); + void pfHitInCache(const XsPFMetaData& pfmeta); + + void notifyHitToDownStream(const RequestPtr& req); + + void offloadToDownStream(); + /** Registers probes. */ void regProbePoints(); private: + struct PfProxyStat : public statistics::Group + { + PfProxyStat(statistics::Group *parent); + statistics::Scalar notifymiss; + statistics::Scalar notifyhit; + statistics::Scalar issuedPf; + } stat; + + /** Schedule the next ready prefetch */ void scheduleNextPrefetch(); @@ -129,29 +149,49 @@ /** Accessor functions */ - // bool inCache(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->inCache(addr, is_secure); - // } - - // bool hasBeenPrefetched(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->hasBeenPrefetched(addr, is_secure); - // } - - // bool hasBeenPrefetched(Addr addr, bool is_secure, - // RequestorID requestor) const override - // { - // return cacheCntrl->hasBeenPrefetched(addr, is_secure, requestor); - // } - - // bool inMissQueue(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->inMissQueue(addr, is_secure); - // } - - // bool coalesce() const override - // { return cacheCntrl->coalesce(); } + bool inCache(Addr addr, bool is_secure) const override + { + return cacheCntrl->inCache(addr, is_secure); + } + + virtual unsigned level() const override + { + return cacheCntrl->level(); + } + + bool hasBeenPrefetched(Addr addr, bool is_secure) const override + { + return cacheCntrl->hasBeenPrefetched(addr, is_secure); + } + + bool hasBeenPrefetched(Addr addr, bool is_secure, + RequestorID requestor) const override + { + return cacheCntrl->hasBeenPrefetched(addr, is_secure, requestor); + } + + bool hasEverBeenPrefetched(Addr addr, bool is_secure) const override + { + return cacheCntrl->hasEverBeenPrefetched(addr, is_secure); + } + + Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) override + { + return cacheCntrl->getHitBlkXsMetadata(pkt->getAddr(), pkt->isSecure()); + } + + bool inMissQueue(Addr addr, bool is_secure) const override + { + return cacheCntrl->inMissQueue(addr, is_secure); + } + + bool coalesce() const override + { return cacheCntrl->coalesce(); } + + const uint8_t* findBlock(Addr addr, bool is_secure) const override + { + return cacheCntrl->findBlock(addr, is_secure)->getDataBlk().getData(0, RubySystem::getBlockSizeBytes()); + } }; diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 169eb9a21c..dffb5bb34d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -67,8 +67,17 @@ namespace gem5 namespace ruby { +Sequencer::SequencerStat::SequencerStat(statistics::Group *parent) + : statistics::Group(parent, "Sequencer"), + ADD_STAT(notifymiss, "delayed-miss notifications received from the cache"), + ADD_STAT(loadcancel, "pending loads whose speculative result was cancelled") +{ +} + Sequencer::Sequencer(const Params &p) - : RubyPort(p), m_IncompleteTimes(MachineType_NUM), + : RubyPort(p), + stat(this), + m_IncompleteTimes(MachineType_NUM), deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check") { m_outstanding_count = 0; @@ -376,6 +385,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type, DPRINTF(RubySequencer, "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n", pkt, pkt->cmdString()); ruby_custom_signal_callback(pkt); + stat.loadcancel++; } } return RequestStatus_Aliased; @@ -629,7 +639,7 @@ void Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy) { assert(address == makeLineAddress(address)); -
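// notifyMissCallback() and the new TBEFullCancel() below share one
// cancellation pattern: walk the per-line list of outstanding sequencer
// requests and squash the speculation of every pending load, roughly:
//
//   for (auto &seq_req : m_RequestTable[address])
//       if (seq_req.pkt->isRead())
//           ruby_custom_signal_callback(seq_req.pkt);  // ask the core to replay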
stat.notifymiss++; auto it = m_RequestTable.find(address); assert(it != m_RequestTable.end()); @@ -639,16 +649,40 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy) for (auto &seq_req: seq_req_list) { if (seq_req.pkt->isRead()) { ruby_custom_signal_callback(seq_req.pkt); + stat.loadcancel++; } } m_BusyBlocks.insert(address); - DPRINTF(RubySequencer, "A %s of addr %#x signals the delay of all pending loads", + DPRINTF(RubySequencer, "A %s of addr %#x signals the delay of all pending loads\n", is_upgrade ? "load" : "store", address); return; } +void +Sequencer::TBEFullCancel(Addr address) +{ + assert(address == makeLineAddress(address)); + + auto it = m_RequestTable.find(address); + assert(it != m_RequestTable.end()); + + auto &seq_req_list = it->second; + + // cancel pending loads' speculation + for (auto &seq_req: seq_req_list) { + if (seq_req.pkt->isRead()) { + ruby_custom_signal_callback(seq_req.pkt); + stat.loadcancel++; + } + } + + DPRINTF(RubySequencer, "A TBE-full event at addr %#x signals the delay of all pending loads\n", + address); + return; +} + void Sequencer::atomicCallback(Addr address, DataBlock& data, const bool externalHit, const MachineType mach, diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 85a2a1597a..0ed88d247d 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -129,6 +129,8 @@ class Sequencer : public RubyPort void notifyMissCallback(Addr address, bool is_upgrade, bool is_snoop); + void TBEFullCancel(Addr address); + void atomicCallback(Addr address, DataBlock& data, const bool externalHit = false, @@ -250,6 +252,14 @@ class Sequencer : public RubyPort RubyRequestType secondary_type); private: + + struct SequencerStat : public statistics::Group + { + SequencerStat(statistics::Group *parent); + statistics::Scalar notifymiss; + statistics::Scalar loadcancel; + } stat; + int m_max_outstanding_requests; int m_num_pending_invs; diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 3f570fb952..e3a13c87f8 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -104,7 +104,7 @@ class RubySequencer(RubyPort): dcache = Param.RubyCache("") max_outstanding_requests = Param.Int( - 16, "max requests (incl. prefetches) outstanding" + 1024, "max requests (incl. prefetches) outstanding" ) deadlock_threshold = Param.Cycles( 500000,