diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py
index 4a8c79880f..7cb14af14b 100644
--- a/configs/common/CacheConfig.py
+++ b/configs/common/CacheConfig.py
@@ -170,17 +170,14 @@ def config_cache(options, system):
         system.tol3bus = L2XBar(clk_domain=system.cpu_clk_domain, width=256)
         system.l3.cpu_side = system.tol3bus.mem_side_ports
         system.l3.mem_side = system.membus.cpu_side_ports
-        system.l3.max_cache_level = 3

     for i in range(options.num_cpus):
         if options.l3cache:
             # l2 -> tol3bus -> l3
             system.l2_caches[i].mem_side = system.tol3bus.cpu_side_ports
             # l3 -> membus
-            system.l2_caches[i].max_cache_level = 3
         else:
             system.l2_caches[i].mem_side = system.membus.cpu_side_ports
-            system.l2_caches[i].max_cache_level = 2

     if options.memchecker:
         system.memchecker = MemChecker()
@@ -189,14 +186,7 @@ def config_cache(options, system):
         if options.caches:
             icache = icache_class(**_get_cache_opts('l1i', options))
             dcache = dcache_class(**_get_cache_opts('l1d', options))
-            if options.l2cache:
-                icache.max_cache_level = 2
-                dcache.max_cache_level = 2
-            if options.l3cache:
-                icache.max_cache_level = 3
-                dcache.max_cache_level = 3
             if dcache.prefetcher != NULL:
-                print("Add dtb for L1D prefetcher")
                 dcache.prefetcher.registerTLB(system.cpu[i].mmu.dtb)
                 if options.l1d_hwp_type == 'XSCompositePrefetcher':
                     if options.l1d_enable_spp:
@@ -233,7 +223,6 @@ def config_cache(options, system):
                 dcache.prefetcher.add_pf_downstream(system.l2_caches[i].prefetcher)
                 system.l2_caches[i].prefetcher.queue_size = 64
                 system.l2_caches[i].prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L2 prefetcher {} as downstream of L1D prefetcher {}".format(i, i))

             if options.l3cache and options.l2_to_l3_pf_hint:
                 assert system.l2_caches[i].prefetcher != NULL and \
@@ -241,7 +230,6 @@ def config_cache(options, system):
                 system.l2_caches[i].prefetcher.add_pf_downstream(system.l3.prefetcher)
                 system.l3.prefetcher.queue_size = 64
                 system.l3.prefetcher.max_prefetch_requests_with_pending_translation = 128
-                print("Add L3 prefetcher as downstream of L2 prefetcher {}".format(i))

     # If we have a walker cache specified, instantiate two
     # instances here
diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index a2cb1ba20b..145ad665cd 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -89,8 +89,6 @@ class L2Cache(Cache):
     mshrs = 64
     tgts_per_mshr = 20
     clusivity='mostly_incl'
-    prefetch_on_access = True
-    #prefetch_on_access = False

     # always writeback clean when lower level is exclusive
     writeback_clean = True
@@ -110,7 +108,6 @@ class L3Cache(Cache):
     tgts_per_mshr = 20
     clusivity='mostly_excl'
     writeback_clean = False
-    prefetch_on_access = True

     # aligned latency:
     tag_latency = 2
diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py
new file mode 100644
index 0000000000..5e46a4518d
--- /dev/null
+++ b/configs/common/PrefetcherConfig.py
@@ -0,0 +1,53 @@
+import m5
+from m5.objects import *
+from common.Caches import *
+from common import ObjectList
+
+
+def _get_hwp(hwp_option):
+    if hwp_option == None:
+        return NULL
+
+    hwpClass = ObjectList.hwp_list.get(hwp_option)
+    return hwpClass()
+
+def create_prefetcher(cpu, cache_level, options):
+    prefetcher_attr = '{}_hwp_type'.format(cache_level)
+    prefetcher_name = ''
+    prefetcher = NULL
+    if hasattr(options, prefetcher_attr):
+        prefetcher_name = getattr(options, prefetcher_attr)
+        prefetcher = _get_hwp(prefetcher_name)
+        print(f"create_prefetcher at {cache_level}: {prefetcher_name}")
+
+    if prefetcher == NULL:
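+        # no prefetcher configured for this cache level, so callers skip all wiring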
+        return NULL
+
+    if cpu != NULL:
+        prefetcher.registerTLB(cpu.mmu.dtb)
+
+    prefetcher.queue_size = 64
+
+    if prefetcher_name == 'XSCompositePrefetcher':
+        if options.l1d_enable_spp:
+            prefetcher.enable_spp = True
+        if options.l1d_enable_cplx:
+            prefetcher.enable_cplx = True
+        prefetcher.pht_pf_level = options.pht_pf_level
+        prefetcher.short_stride_thres = options.short_stride_thres
+        prefetcher.fuzzy_stride_matching = False
+        prefetcher.stream_pf_ahead = True
+        prefetcher.bop_large.delay_queue_enable = True
+        prefetcher.bop_large.bad_score = 10
+        prefetcher.bop_small.delay_queue_enable = True
+        prefetcher.bop_small.bad_score = 5
+        prefetcher.queue_size = 128
+        prefetcher.max_prefetch_requests_with_pending_translation = 128
+        prefetcher.region_size = 64*16  # 64B * blocks per region
+
+        prefetcher.berti.use_byte_addr = True
+        prefetcher.berti.aggressive_pf = False
+        prefetcher.berti.trigger_pht = True
+
+    return prefetcher
\ No newline at end of file
diff --git a/configs/common/cores/arm/O3_ARM_v7a.py b/configs/common/cores/arm/O3_ARM_v7a.py
index d032a1aa88..5bef6251dc 100644
--- a/configs/common/cores/arm/O3_ARM_v7a.py
+++ b/configs/common/cores/arm/O3_ARM_v7a.py
@@ -179,7 +179,6 @@ class O3_ARM_v7aL2(Cache):
     size = '1MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)
diff --git a/configs/common/cores/arm/ex5_LITTLE.py b/configs/common/cores/arm/ex5_LITTLE.py
index 57f6a6b812..ba2504c74e 100644
--- a/configs/common/cores/arm/ex5_LITTLE.py
+++ b/configs/common/cores/arm/ex5_LITTLE.py
@@ -122,7 +122,6 @@ class L2(Cache):
     size = '512kB'
     assoc = 8
     write_buffers = 16
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=1, latency = 1)
diff --git a/configs/common/cores/arm/ex5_big.py b/configs/common/cores/arm/ex5_big.py
index de7a45063a..c5cad4957a 100644
--- a/configs/common/cores/arm/ex5_big.py
+++ b/configs/common/cores/arm/ex5_big.py
@@ -174,7 +174,6 @@ class L2(Cache):
     size = '2MB'
     assoc = 16
     write_buffers = 8
-    prefetch_on_access = True
     clusivity = 'mostly_excl'
     # Simple stride prefetcher
     prefetcher = StridePrefetcher(degree=8, latency = 1)
diff --git a/configs/ruby/CHI.py b/configs/ruby/CHI.py
index a9e8eeb7e6..683e6abd9e 100644
--- a/configs/ruby/CHI.py
+++ b/configs/ruby/CHI.py
@@ -111,25 +111,29 @@ def create_system(
     # dataAccessLatency may be set to 0 if one wants to consider parallel
     # data and tag lookups
     class L1ICache(RubyCache):
+        level = 1
         dataAccessLatency = 1
         tagAccessLatency = 1
         size = options.l1i_size
        assoc = options.l1i_assoc

     class L1DCache(RubyCache):
+        level = 1
         dataAccessLatency = 0
         tagAccessLatency = 1
         size = options.l1d_size
         assoc = options.l1d_assoc

     class L2Cache(RubyCache):
-        dataAccessLatency = 6
+        level = 2
+        dataAccessLatency = 13
         tagAccessLatency = 2
         size = options.l2_size
         assoc = options.l2_assoc

     class HNFCache(RubyCache):
-        dataAccessLatency = 10
+        level = 3
+        dataAccessLatency = 17
         tagAccessLatency = 2
         size = options.l3_size
         assoc = options.l3_assoc
@@ -154,11 +158,13 @@ class HNFCache(RubyCache):
             L1ICache,
             L1DCache,
             system.cache_line_size.value,
+            options
         )
         for cpu in cpus
     ]
+
     for rnf in ruby_system.rnf:
-        rnf.addPrivL2Cache(L2Cache)
+        rnf.addPrivL2Cache(L2Cache, options)
         cpu_sequencers.extend(rnf.getSequencers())
         all_cntrls.extend(rnf.getAllControllers())
         network_nodes.append(rnf)
@@ -191,10 +197,17 @@ class HNFCache(RubyCache):
     hnf_list = [i for i in range(options.num_l3caches)]
     CHI_HNF.createAddrRanges(sysranges, system.cache_line_size.value, hnf_list)
     ruby_system.hnf = [
-        CHI_HNF(i, ruby_system, HNFCache, None)
+        CHI_HNF(i, ruby_system, HNFCache, options, None)
         for i in range(options.num_l3caches)
     ]

+    if options.l2_to_l3_pf_hint:
+        if len(ruby_system.hnf) > 1:
+            print("Warning: L2 to L3 prefetch hint is not supported with multiple HNFs")
+        else:
+            for rnf in ruby_system.rnf:
+                rnf.addLLCPrefetcherDownstream(ruby_system.hnf[0].getPrefetcher())
+
     for hnf in ruby_system.hnf:
         network_nodes.append(hnf)
         network_cntrls.extend(hnf.getNetworkSideControllers())
diff --git a/configs/ruby/CHI_config.py b/configs/ruby/CHI_config.py
index b04d144207..9df852f83c 100644
--- a/configs/ruby/CHI_config.py
+++ b/configs/ruby/CHI_config.py
@@ -49,7 +49,7 @@

 import m5
 from m5.objects import *
-
+from common.PrefetcherConfig import create_prefetcher

 class Versions:
     """
@@ -86,9 +86,9 @@ class NoC_Params:
     router_link_latency = 1
     node_link_latency = 1
     router_latency = 1
-    router_buffer_size = 4
-    cntrl_msg_size = 8
-    data_width = 32
+    router_buffer_size = 128
+    cntrl_msg_size = 0
+    data_width = 64
     cross_links = []
     cross_link_latency = 0
@@ -222,6 +222,10 @@ def __init__(self, ruby_system):
         # timeouts on unique lines when a store conditional fails
         self.sc_lock_enabled = False

+    def setPrefetcher(self, pf):
+        self.prefetcher = pf
+        self.use_prefetcher = pf != NULL
+

 class CHI_L1Controller(CHI_Cache_Controller):
     """
@@ -249,11 +253,11 @@ def __init__(self, ruby_system, sequencer, cache, prefetcher, is_dcache=False):
         self.alloc_on_atomic = False
         self.dealloc_on_unique = False
         self.dealloc_on_shared = False
-        self.dealloc_backinv_unique = True
-        self.dealloc_backinv_shared = True
+        self.dealloc_backinv_unique = False
+        self.dealloc_backinv_shared = False
         self.is_dcache = is_dcache
         # Some reasonable default TBE params
-        self.number_of_TBEs = 16
+        self.number_of_TBEs = 32+8
         self.number_of_repl_TBEs = 16
         self.number_of_snoop_TBEs = 4
         self.number_of_DVM_TBEs = 16
@@ -261,6 +265,9 @@ def __init__(self, ruby_system, sequencer, cache, prefetcher, is_dcache=False):
         self.unify_repl_TBEs = False

+        self.response_latency = 4
+        self.request_latency = 1
+

 class CHI_L2Controller(CHI_Cache_Controller):
     """
@@ -288,16 +295,18 @@ def __init__(self, ruby_system, cache, prefetcher):
         self.alloc_on_atomic = False
         self.dealloc_on_unique = False
         self.dealloc_on_shared = False
-        self.dealloc_backinv_unique = True
-        self.dealloc_backinv_shared = True
+        self.dealloc_backinv_unique = False
+        self.dealloc_backinv_shared = False
         # Some reasonable default TBE params
-        self.number_of_TBEs = 32
+        self.number_of_TBEs = 64+16
         self.number_of_repl_TBEs = 32
-        self.number_of_snoop_TBEs = 16
+        self.number_of_snoop_TBEs = 32
         self.number_of_DVM_TBEs = 1 # should not receive any dvm
         self.number_of_DVM_snoop_TBEs = 1 # should not receive any dvm
         self.unify_repl_TBEs = False

+        self.response_latency = 12
+        self.request_latency = 1

 class CHI_HNFController(CHI_Cache_Controller):
     """
@@ -329,13 +338,17 @@ def __init__(self, ruby_system, cache, prefetcher, addr_ranges):
         self.dealloc_backinv_unique = False
         self.dealloc_backinv_shared = False
         # Some reasonable default TBE params
-        self.number_of_TBEs = 32
+        self.number_of_TBEs = 256 + 32
         self.number_of_repl_TBEs = 32
         self.number_of_snoop_TBEs = 1 # should not receive any snoop
         self.number_of_DVM_TBEs = 1 # should not receive any dvm
         self.number_of_DVM_snoop_TBEs = 1 # should not receive any dvm
         self.unify_repl_TBEs = False

+        self.response_latency = 40
+        self.request_latency = 1
+
+
 class CHI_MNController(MiscNode_Controller):
     """
@@ -466,8 +479,7 @@ def __init__(
         l1Icache_type,
         l1Dcache_type,
         cache_line_size,
-        l1Iprefetcher_type=None,
-        l1Dprefetcher_type=None,
+        options
     ):
         super().__init__(ruby_system)
@@ -505,16 +517,13 @@ def __init__(
                 start_index_bit=self._block_size_bits,
                 is_icache=False
             )
-            # prefetcher wrappers
-            if l1Iprefetcher_type != None:
-                l1i_pf = l1Iprefetcher_type()
-            else:
-                l1i_pf = NULL
+            # create icache prefetcher
+            l1i_pf = NULL

-            if l1Dprefetcher_type != None:
-                l1d_pf = l1Dprefetcher_type()
-            else:
-                l1d_pf = NULL
+            # create dcache prefetcher
+            l1d_pf = create_prefetcher(cpu, 'l1d', options)
+            if l1d_pf != NULL and options.cpu_type == 'DerivO3CPU':
+                cpu.add_pf_downstream(l1d_pf)

             # cache controllers
             cpu.l1i = CHI_L1Controller(
@@ -552,30 +561,38 @@ def setDownstream(self, cntrls):
     def getCpus(self):
         return self._cpus

+    def getL1DCachePrefetcher(self, cpu_idx):
+        return self._cpus[cpu_idx].l1d.prefetcher
+
+    def getL2CachePrefetcher(self, cpu_idx):
+        return self._cpus[cpu_idx].l2.prefetcher
+
     # Adds a private L2 for each cpu
-    def addPrivL2Cache(self, cache_type, pf_type=None):
+    def addPrivL2Cache(self, cache_type, options):
         self._ll_cntrls = []
         for cpu in self._cpus:
             l2_cache = cache_type(
                 start_index_bit=self._block_size_bits,
                 is_icache=False
             )
-            if pf_type != None:
-                l2_pf = pf_type()
-            else:
-                l2_pf = NULL
+            l2_pf = create_prefetcher(NULL, 'l2', options)
+            if l2_pf != NULL and options.l1_to_l2_pf_hint:
+                cpu.l1d.prefetcher.add_pf_downstream(l2_pf)
             cpu.l2 = CHI_L2Controller(self._ruby_system, l2_cache, l2_pf)

             self._cntrls.append(cpu.l2)
             self.connectController(cpu.l2)
-
             self._ll_cntrls.append(cpu.l2)

             for c in cpu._ll_cntrls:
                 c.downstream_destinations = [cpu.l2]
             cpu._ll_cntrls = [cpu.l2]

+    def addLLCPrefetcherDownstream(self, llc_pf):
+        for cpu in self._cpus:
+            cpu.l2.prefetcher.add_pf_downstream(llc_pf)
+

 class CHI_HNF(CHI_Node):
     """
@@ -618,16 +635,18 @@ def getAddrRanges(cls, hnf_idx):

     # The CHI controller can be a child of this object or another if
     # 'parent' if specified
-    def __init__(self, hnf_idx, ruby_system, llcache_type, parent):
+    def __init__(self, hnf_idx, ruby_system, llcache_type, options, parent):
         super().__init__(ruby_system)

         addr_ranges, intlvHighBit = self.getAddrRanges(hnf_idx)
         # All ranges should have the same interleaving
         assert len(addr_ranges) >= 1

+        llc_pf = create_prefetcher(NULL, 'l3', options)
+
         ll_cache = llcache_type(start_index_bit=intlvHighBit + 1)
         self._cntrl = CHI_HNFController(
-            ruby_system, ll_cache, NULL, addr_ranges
+            ruby_system, ll_cache, llc_pf, addr_ranges
         )

         if parent == None:
@@ -637,6 +656,9 @@ def __init__(self, hnf_idx, ruby_system, llcache_type, options, parent):

         self.connectController(self._cntrl)

+    def getPrefetcher(self):
+        return self._cntrl.prefetcher
+
     def getAllControllers(self):
         return [self._cntrl]
@@ -698,6 +720,7 @@ def __init__(self, ruby_system, parent):
             requestToMemory=MemCtrlMessageBuffer(),
             reqRdy=TriggerMessageBuffer(),
             transitions_per_cycle=1024,
+            number_of_TBEs = 1024
         )

         # The Memory_Controller implementation deallocates the TBE for
diff --git a/configs/ruby/Ruby.py b/configs/ruby/Ruby.py
index 305f6f5ccb..0e31196bab 100644
--- a/configs/ruby/Ruby.py
+++ b/configs/ruby/Ruby.py
@@ -69,7 +69,7 @@ def define_options(parser):
         "--ruby-clock",
         action="store",
         type=str,
-        default="2GHz",
+        default="3GHz",
         help="Clock for blocks running at Ruby system's speed",
     )
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index 23ed3833ff..5b7d16bb00 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -112,7 +112,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst)
         }
         // check bypass data ready or not
         if (!scheduler->bypassScoreboard[src->flatIndex()]) {
-            panic("[sn %lu] %s can't get data from bypassNetwork\n", inst->seqNum, inst->srcRegIdx(i));
+            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex());
+            panic("[sn %lu] %s can't get data from bypassNetwork, dst inst: %s\n", inst->seqNum, inst->srcRegIdx(i),
+                  dst_inst->genDisassembly());
         }
     }
     inst->checkOldVdElim();
@@ -578,6 +580,20 @@ Scheduler::full(const DynInstPtr& inst)
     return true;
 }

+DynInstPtr
+Scheduler::getInstByDstReg(RegIndex flatIdx)
+{
+    for (auto iq : issueQues) {
+        for (auto& inst : iq->instList) {
+            if (inst->numDestRegs() > 0 && inst->renamedDestIdx(0)->flatIndex() == flatIdx) {
+                return inst;
+            }
+        }
+    }
+    return nullptr;
+}
+
 void
 Scheduler::addProducer(const DynInstPtr& inst)
 {
@@ -693,6 +709,9 @@ Scheduler::insertSlot(const DynInstPtr& inst)
 void
 Scheduler::loadCancel(const DynInstPtr& inst)
 {
+    if (inst->canceled()) {
+        return;
+    }
     DPRINTF(Schedule, "[sn %lu] %s cache miss, cancel consumers\n", inst->seqNum,
             enums::OpClassStrings[inst->opClass()]);
     inst->setCancel();
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index 867e808e57..d569de074a 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -218,6 +218,7 @@ class Scheduler : public SimObject
     void issueAndSelect();
     bool full(const DynInstPtr& inst);
     bool ready(const DynInstPtr& inst);
+    DynInstPtr getInstByDstReg(RegIndex flatIdx);
     void addProducer(const DynInstPtr& inst);
     // return true if insert successful
diff --git a/src/dev/riscv/nemu_mmc.cc b/src/dev/riscv/nemu_mmc.cc
index fc30e28c8c..255d31f449 100644
--- a/src/dev/riscv/nemu_mmc.cc
+++ b/src/dev/riscv/nemu_mmc.cc
@@ -17,7 +17,6 @@ NemuMMC::NemuMMC(const NemuMMCParams *p)
       , write_cmd(false)
       , read_ext_csd(false)
 {
-    printf("111\n");
     assert(C_SIZE < (1 << 12));
     sd_reg_base = (uint32_t *)malloc(0x80);
     img_fp = fopen(p->img_path.c_str(), "rb");
@@ -31,7 +30,6 @@ NemuMMC::NemuMMC(const NemuMMCParams *p)
 inline void
 NemuMMC::prepare_rw(int is_write)
 {
-    printf("222\n");
     blk_addr = sd_reg_base[SDARG];
     tmp_addr = 0;
     if (img_fp)
@@ -42,7 +40,6 @@ NemuMMC::prepare_rw(int is_write)
 void
 NemuMMC::sdcard_handle_cmd(int cmd)
 {
-    // printf("333\n");
     switch (cmd) {
         case MMC_GO_IDLE_STATE: break;
@@ -101,7 +98,6 @@ NemuMMC::sdcard_handle_cmd(int cmd)
 void
 NemuMMC::sdcard_io_handler(uint32_t offset)
 {
-    // printf("444\n");
     assert(img_fp);
     int idx = offset / 4;
     switch (idx) {
@@ -149,7 +145,6 @@ NemuMMC::sdcard_io_handler(uint32_t offset)
 void
 NemuMMC::unserialize_sdcard(FILE *sdfp)
 {
-    // printf("555\n");
     __attribute__((unused)) int ret;
     ret = fread(sd_reg_base, 4, 0x80 / 4, sdfp);
     ret = fread(&tmp_addr, 4, 1, sdfp);
@@ -163,7 +158,6 @@ NemuMMC::unserialize_sdcard(FILE *sdfp)
 Tick
 NemuMMC::read(PacketPtr pkt)
 {
-    // printf("666\n");
     assert(pkt->getSize() == 4);
     Addr offset = pkt->getAddr() - pioAddr;
     // handler before read
@@ -180,7 +174,6 @@ NemuMMC::read(PacketPtr pkt)
 Tick
 NemuMMC::write(PacketPtr pkt)
 {
-    //printf("777\n");
     assert(pkt->getSize() == 4);
     Addr offset = pkt->getAddr() - pioAddr;
     uint32_t write_val = pkt->getRaw<uint32_t>();
diff --git a/src/mem/cache/Cache.py b/src/mem/cache/Cache.py
index 791a2223c8..1a0aaa7768 100644
--- a/src/mem/cache/Cache.py
+++ b/src/mem/cache/Cache.py
@@ -98,11 +98,6 @@ class BaseCache(ClockedObject):
     enable_wayprediction = Param.Bool(True, "enablewaypredction")

     prefetcher = Param.BasePrefetcher(NULL,"Prefetcher attached to cache")
-    prefetch_on_access = Param.Bool(False,
-        "Notify the hardware prefetcher on every access (not just misses)")
-    prefetch_on_pf_hit = Param.Bool(False,
-        "Notify the hardware prefetcher on hit on prefetched lines")
-
     tags = Param.BaseTags(BaseSetAssoc(), "Tag store")
     replacement_policy = Param.BaseReplacementPolicy(LRURP(), "Replacement policy")
@@ -157,8 +152,6 @@ class BaseCache(ClockedObject):

     cache_level = Param.Unsigned(0, "Cache level (L1 is 1, L2 is 2, etc.)")

-    max_cache_level = Param.Unsigned(2, "Max Cache level (L1 is 1, L2 is 2, etc.)")
-
     force_hit = Param.Bool(False, "Force some PC to hit in L1")

     way_entries = Param.MemorySize(
         "64",
diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc
index 38d5fff7a7..146dcf35a5 100644
--- a/src/mem/cache/base.cc
+++ b/src/mem/cache/base.cc
@@ -145,7 +145,6 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
       system(p.system),
       stats(*this),
       cacheLevel(p.cache_level),
-      maxCacheLevel(p.max_cache_level),
       forceHit(p.force_hit)
 {
     // the MSHR queue has no reserve entries as we check the MSHR
@@ -166,7 +165,7 @@ BaseCache::BaseCache(const BaseCacheParams &p, unsigned blk_size)
     assert(!((size != DEFAULTWAYPRESIZE) && enableWayPrediction));

     if (prefetcher)
-        prefetcher->setCache(this);
+        prefetcher->setParentInfo(system, getProbeManager(), this, getBlockSize());

     fatal_if(compressor && !dynamic_cast<CompressedTags *>(tags),
              "The tags of compressed cache %s must derive from CompressedTags",
@@ -1150,28 +1149,28 @@ BaseCache::getNextQueueEntry()
         PacketPtr pkt = prefetcher->getPacket();
         if (pkt) {
             Addr pf_addr = pkt->getBlockAddr(blkSize);
-            int pf_num = pkt->req->getXsMetadata().prefetchSource;
+            PrefetchSourceType pf_type = pkt->req->getXsMetadata().prefetchSource;
             if (tags->findBlock(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in cache, "
                         "dropped.\n", pf_addr);
-                prefetcher->pfHitInCache(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInCache(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
             } else if (mshrQueue.findMatch(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in a MSHR, "
                         "dropped.\n", pf_addr);
-                prefetcher->pfHitInMSHR(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInMSHR(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
             } else if (writeBuffer.findMatch(pf_addr, pkt->isSecure())) {
                 DPRINTF(HWPrefetch, "Prefetch %#x has hit in the "
                         "Write Buffer, dropped.\n", pf_addr);
-                prefetcher->pfHitInWB(pf_num);
-                if (pf_num == 1)
+                prefetcher->pfHitInWB(pf_type);
+                if (pf_type == PrefetchSourceType::SStream)
                     prefetcher->streamPflate();
                 // free the request and packet
                 delete pkt;
diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh
index ab7607f4c5..54c0169086 100644
--- a/src/mem/cache/base.hh
+++ b/src/mem/cache/base.hh
@@ -60,6 +60,7 @@
 #include "debug/CacheTrace.hh"
 #include "enums/Clusivity.hh"
 #include "mem/cache/cache_blk.hh"
+#include "mem/cache/cache_probe_arg.hh"
 #include "mem/cache/compressors/base.hh"
 #include "mem/cache/mshr_queue.hh"
 #include "mem/cache/prefetch/associative_set.hh"
@@ -95,7 +96,7 @@ struct BaseCacheParams;

 /**
  * A basic cache interface. Implements some common functions for speed.
  */
-class BaseCache : public ClockedObject
+class BaseCache : public ClockedObject, CacheAccessor
 {
   protected:
     /**
@@ -1365,50 +1366,6 @@ class BaseCache : public ClockedObject, CacheAccessor
         memSidePort.schedSendEvent(time);
     }

-    bool inCache(Addr addr, bool is_secure) const {
-        return tags->findBlock(addr, is_secure);
-    }
-
-    bool hasBeenPrefetched(Addr addr, bool is_secure) const {
-        CacheBlk *block = tags->findBlock(addr, is_secure);
-        if (block) {
-            return block->wasEverPrefetched();
-        } else {
-            return false;
-        }
-    }
-
-    bool hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const {
-        CacheBlk *block = tags->findBlock(addr, is_secure);
-        if (block) {
-            return block->wasPrefetched();
-        } else {
-            return false;
-        }
-    }
-
-    CacheBlk* findBlock(Addr addr, bool is_secure) {
-        return tags->findBlock(addr, is_secure);
-    }
-
-    Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt)
-    {
-        CacheBlk *block = tags->findBlock(pkt->getAddr(), pkt->isSecure());
-        assert(block);
-        /* clean prefetchSource if the block was not prefetched */
-        if (!block->wasEverPrefetched()) {
-            Request::XsMetadata blkMeta = block->getXsMetadata();
-            blkMeta.prefetchSource = PrefetchSourceType::PF_NONE;
-            block->setXsMetadata(blkMeta);
-        }
-        return block->getXsMetadata();
-    }
-
-
-    bool inMissQueue(Addr addr, bool is_secure) const {
-        return mshrQueue.findMatch(addr, is_secure);
-    }
-
     void incMissCount(PacketPtr pkt)
     {
         assert(pkt->req->requestorId() < system->maxRequestors());
@@ -1455,14 +1412,6 @@ class BaseCache : public ClockedObject, CacheAccessor
         }
     }

-    /**
-     * Checks if the cache is coalescing writes
-     *
-     * @return True if the cache is coalescing writes
-     */
-    bool coalesce() const;
-
-
     /**
      * Cache block visitor that writes back dirty cache blocks using
      * functional writes.
@@ -1508,7 +1457,7 @@ class BaseCache : public ClockedObject, CacheAccessor

     const unsigned cacheLevel{0};

-    const unsigned maxCacheLevel;
+    //const unsigned maxCacheLevel;

     const bool dumpMissPC{false};
@@ -1520,7 +1469,60 @@ class BaseCache : public ClockedObject, CacheAccessor
     const bool forceHit;

   public:
-    unsigned level() { return cacheLevel; }
+    // CacheAccessor overridden functions
+
+    bool inCache(Addr addr, bool is_secure) const override { return tags->findBlock(addr, is_secure); }
+
+    unsigned level() const override { return cacheLevel; }
+
+    bool hasBeenPrefetched(Addr addr, bool is_secure) const override
+    {
+        CacheBlk *block = tags->findBlock(addr, is_secure);
+        if (block) {
+            return block->wasPrefetched();
+        } else {
+            return false;
+        }
+    }
+
+    bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) const override {
+        panic("hasBeenPrefetched Not implemented");
+        return false;
+    }
+
+    bool hasEverBeenPrefetched(Addr addr, bool is_secure) const override
+    {
+        CacheBlk *block = tags->findBlock(addr, is_secure);
+        if (block) {
+            return block->wasEverPrefetched();
+        } else {
+            return false;
+        }
+    }
+
+    Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) override
+    {
+        CacheBlk *block = tags->findBlock(pkt->getAddr(), pkt->isSecure());
+        assert(block);
+        /* clean prefetchSource if the block was not prefetched */
+        if (!block->wasEverPrefetched()) {
+            Request::XsMetadata blkMeta = block->getXsMetadata();
+            blkMeta.prefetchSource = PrefetchSourceType::PF_NONE;
+            block->setXsMetadata(blkMeta);
+        }
+        return block->getXsMetadata();
+    }
+
+    bool inMissQueue(Addr addr, bool is_secure) const override {
+        return mshrQueue.findMatch(addr, is_secure);
+    }
+
+    bool coalesce() const override;
+
+    const uint8_t* findBlock(Addr addr, bool is_secure) const override {
+        auto blk = tags->findBlock(addr, is_secure);
+        return blk ? blk->data : nullptr;
+    }
 };

 /**
diff --git a/src/mem/cache/cache_blk.hh b/src/mem/cache/cache_blk.hh
index 5363b92dff..424914e799 100644
--- a/src/mem/cache/cache_blk.hh
+++ b/src/mem/cache/cache_blk.hh
@@ -206,7 +206,7 @@ class CacheBlk : public TaggedEntry
     {
         TaggedEntry::invalidate();

-        clearPrefetched();
+        clearAllPrefetched();
         clearPendingInvalidate();
         clearCoherenceBits(AllBits);
diff --git a/src/mem/cache/cache_probe_arg.hh b/src/mem/cache/cache_probe_arg.hh
new file mode 100644
index 0000000000..ea7b10d88a
--- /dev/null
+++ b/src/mem/cache/cache_probe_arg.hh
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __MEM_CACHE_PROBE_ARG_HH__
+#define __MEM_CACHE_PROBE_ARG_HH__
+
+#include "mem/packet.hh"
+
+namespace gem5
+{
+
+/**
+ * Provides generic cache lookup functions. A cache may provide
+ * a CacheAccessor object to other components that need to perform
+ * a lookup outside the normal cache control flow. Currently this
+ * is used by prefetchers that perform lookups when notified by
+ * cache events.
+ */
+struct CacheAccessor
+{
+    /** Determine if address is in cache */
+    virtual bool inCache(Addr addr, bool is_secure) const = 0;
+
+    // cache level, l1 is 1, l2 is 2, etc.
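+    //  (consumed by cross-level prefetch plumbing, e.g. when deciding which
+    //  cache level a prefetch-ahead request should be sent to)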
+ virtual unsigned level() const = 0; + + /** Determine if address has been prefetched */ + virtual bool hasBeenPrefetched(Addr addr, bool is_secure) const = 0; + + /** Determine if address has been prefetched by the requestor */ + virtual bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) const = 0; + + virtual bool hasEverBeenPrefetched(Addr addr, bool is_secure) const = 0; + + virtual Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) = 0; + + /** Determine if address is in cache miss queue */ + virtual bool inMissQueue(Addr addr, bool is_secure) const = 0; + + /** Determine if cache is coalescing writes */ + virtual bool coalesce() const = 0; + + virtual const uint8_t* findBlock(Addr addr, bool is_secure) const = 0; +}; + +/** + * Information provided to probes on a cache event. + * @sa ppHit, ppMiss, ppFill in gem5::BaseCache (src/mem/cache/base.hh) + */ +class CacheAccessProbeArg +{ + public: + /** Packet that triggered the cache access*/ + PacketPtr pkt; + /** Accessor for the cache */ + CacheAccessor &cache; + + CacheAccessProbeArg(PacketPtr _pkt, CacheAccessor &_cache) + :pkt(_pkt), cache(_cache) + { + } +}; + +/** + * A data contents update is composed of the updated block's address, + * the old contents, and the new contents. + * @sa ppDataUpdate in gem5::BaseCache (src/mem/cache/base.hh) + */ +struct CacheDataUpdateProbeArg +{ + /** The updated block's address. */ + Addr addr; + /** Whether the block belongs to the secure address space. */ + bool isSecure; + /** Block original requestor */ + const RequestorID requestorID; + /** The stale data contents. If zero-sized this update is a fill. */ + std::vector oldData; + /** The new data contents. If zero-sized this is an invalidation. */ + std::vector newData; + /** Set if the update is from a prefetch or evicting a prefetched + * block that was never used. 
*/ + bool hwPrefetched; + /** Accessor for the cache */ + CacheAccessor &accessor; + + CacheDataUpdateProbeArg(Addr _addr, bool is_secure, + RequestorID _requestorID, + CacheAccessor &_accessor) + : addr(_addr), isSecure(is_secure), requestorID(_requestorID), + oldData(), newData(), accessor(_accessor) + { + } +}; + +} // namespace gem5 + +#endif //__MEM_CACHE_PROBE_ARG_HH__ diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py index 5f6621e7c8..75649d9d94 100644 --- a/src/mem/cache/prefetch/Prefetcher.py +++ b/src/mem/cache/prefetch/Prefetcher.py @@ -77,9 +77,9 @@ class BasePrefetcher(ClockedObject): on_write = Param.Bool(True, "Notify prefetcher on writes") on_data = Param.Bool(True, "Notify prefetcher on data accesses") on_inst = Param.Bool(True, "Notify prefetcher on instruction accesses") - prefetch_on_access = Param.Bool(Parent.prefetch_on_access, + prefetch_on_access = Param.Bool(False, "Notify the hardware prefetcher on every access (not just misses)") - prefetch_on_pf_hit = Param.Bool(Parent.prefetch_on_pf_hit, + prefetch_on_pf_hit = Param.Bool(False, "Notify the hardware prefetcher on hit on prefetched lines") use_virtual_addresses = Param.Bool(False, "Use virtual addresses for prefetching") @@ -88,7 +88,6 @@ class BasePrefetcher(ClockedObject): is_sub_prefetcher = Param.Bool(False, "Is this a sub-prefetcher") - max_cache_level = Param.Unsigned(Parent.max_cache_level , "Max Cache level (L1 is 1, L2 is 2, etc.)") def __init__(self, **kwargs): super().__init__(**kwargs) @@ -104,10 +103,12 @@ def addEvent(self, newObject): def regProbeListeners(self): print("Registering probe listeners for Prefetcher {}".format(self)) for tlb in self._tlbs: + print(f"{self} addTLB {tlb}") self.getCCObject().addTLB(tlb.getCCObject()) assert len(self._downstream_pf) <= 1 if len(self._downstream_pf): + print(f"{self} addHintDownStream {self._downstream_pf[0]}") self.getCCObject().addHintDownStream(self._downstream_pf[0].getCCObject()) for event in self._events: @@ -329,6 +330,7 @@ class WorkerPrefetcher(QueuedPrefetcher): on_data = True on_miss = False + prefetch_on_access = True prefetch_on_pf_hit = True use_virtual_addresses = True diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc index 1731761cf7..0cbdcef5ad 100644 --- a/src/mem/cache/prefetch/base.cc +++ b/src/mem/cache/prefetch/base.cc @@ -126,7 +126,7 @@ Base::PrefetchListener::notify(const PacketPtr &pkt) Base::Base(const BasePrefetcherParams &p) : ClockedObject(p), - listeners(), cache(nullptr), isSubPrefetcher(p.is_sub_prefetcher), + listeners(), isSubPrefetcher(p.is_sub_prefetcher), archDBer(p.arch_db), blkSize(p.block_size), lBlkSize(floorLog2(blkSize)), onMiss(p.on_miss), onRead(p.on_read), onWrite(p.on_write), onData(p.on_data), onInst(p.on_inst), @@ -136,19 +136,20 @@ Base::Base(const BasePrefetcherParams &p) prefetchOnPfHit(p.prefetch_on_pf_hit), useVirtualAddresses(p.use_virtual_addresses), prefetchStats(this), issuedPrefetches(0), - usefulPrefetches(0), streamlatenum(0),tlb(nullptr), maxCacheLevel(p.max_cache_level), - probeManagerDirty(nullptr) + usefulPrefetches(0), streamlatenum(0),tlb(nullptr) { } void -Base::setCache(BaseCache *_cache) +Base::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - assert(!cache); + assert(!cache && !system && !probeManager); + system = sys; + probeManager = pm; cache = _cache; // If the cache has a different block size from the system's, save it - blkSize = cache->getBlockSize(); + blkSize = blk_size; 
lBlkSize = floorLog2(blkSize); } @@ -244,7 +245,7 @@ Base::observeAccess(const PacketPtr &pkt, bool miss) const if (!miss) { if (prefetchOnPfHit) - return hasBeenPrefetched(pkt->getAddr(), pkt->isSecure()); + return hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure()); if (!prefetchOnAccess) return false; } @@ -282,9 +283,9 @@ Base::hasBeenPrefetched(Addr addr, bool is_secure) const } bool -Base::hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const +Base::hasEverBeenPrefetched(Addr addr, bool is_secure) const { - return cache->hasBeenPrefetchedAndNotAccessed(addr, is_secure); + return cache->hasEverBeenPrefetched(addr, is_secure); } bool @@ -358,7 +359,7 @@ Base::probeNotify(const PacketPtr &pkt, bool miss) DPRINTF(HWPrefetch, "Reach condition checked\n"); - if (hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())) { + if (hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())) { usefulPrefetches += 1; prefetchStats.pfUseful++; PrefetchSourceType pf_source = cache->getHitBlkXsMetadata(pkt).prefetchSource; @@ -386,9 +387,9 @@ Base::probeNotify(const PacketPtr &pkt, bool miss) PrefetchInfo pfi(pkt, pkt->req->hasVaddr() ? pkt->req->getVaddr() : pkt->req->getPaddr(), miss, Request::XsMetadata(pf_source, pf_depth)); pfi.setReqAfterSquash(squashMark); - pfi.setEverPrefetched(hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); - pfi.setPfFirstHit(!miss && hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())); - pfi.setPfHit(!miss && hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setEverPrefetched(hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setPfFirstHit(!miss && hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())); + pfi.setPfHit(!miss && hasEverBeenPrefetched(pkt->getAddr(), pkt->isSecure())); squashMark = false; notify(pkt, pfi); } else { @@ -431,18 +432,11 @@ Base::regProbeListeners() * parent cache using the probe "Miss". Also connect to "Hit", if the * cache is configured to prefetch on accesses. 
*/ - if (listeners.empty() && !isSubPrefetcher) { - assert((cache != nullptr) != (probeManagerDirty != nullptr)); - ProbeManager* pm(nullptr); - if (cache != nullptr) { - pm = cache->getProbeManager(); - } else if (probeManagerDirty != nullptr) { - pm = probeManagerDirty; - } - listeners.push_back(new PrefetchListener(*this, pm, "StorePFtrain", false, true, true)); - listeners.push_back(new PrefetchListener(*this, pm, "Miss", false, true, false)); - listeners.push_back(new PrefetchListener(*this, pm, "Fill", true, false, false)); - listeners.push_back(new PrefetchListener(*this, pm, "Hit", false, false, false)); + if (listeners.empty() && !isSubPrefetcher && probeManager != nullptr) { + listeners.push_back(new PrefetchListener(*this, probeManager, "StorePFtrain", false, true, true)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Miss", false, true, false)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Fill", true, false, false)); + listeners.push_back(new PrefetchListener(*this, probeManager, "Hit", false, false, false)); } } diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh index 71ecebcfb4..532af8aa07 100644 --- a/src/mem/cache/prefetch/base.hh +++ b/src/mem/cache/prefetch/base.hh @@ -52,7 +52,7 @@ #include "base/compiler.hh" #include "base/statistics.hh" #include "base/types.hh" -#include "mem/cache/cache_blk.hh" +#include "mem/cache/cache_probe_arg.hh" #include "mem/packet.hh" #include "mem/request.hh" #include "sim/arch_db.hh" @@ -63,7 +63,6 @@ namespace gem5 { -class BaseCache; struct BasePrefetcherParams; GEM5_DEPRECATED_NAMESPACE(Prefetcher, prefetch); @@ -332,14 +331,20 @@ class Base : public ClockedObject protected: + bool isSubPrefetcher; + + ArchDBer* archDBer; + // PARAMETERS /** Pointr to the parent cache. */ - BaseCache* cache; + CacheAccessor* cache = nullptr; - bool isSubPrefetcher; + /** Pointer to the parent system. */ + System* system = nullptr; - ArchDBer* archDBer; + /** Pointer to the parent cache's probe manager. */ + ProbeManager *probeManager = nullptr; /** The block size of the parent cache. 
*/ unsigned blkSize; @@ -390,7 +395,7 @@ class Base : public ClockedObject bool inMissQueue(Addr addr, bool is_secure) const; bool hasBeenPrefetched(Addr addr, bool is_secure) const; - bool hasBeenPrefetchedAndNotAccessed(Addr addr, bool is_secure) const; + bool hasEverBeenPrefetched(Addr addr, bool is_secure) const; /** Determine if addresses are on the same page */ bool samePage(Addr a, Addr b) const; @@ -458,20 +463,11 @@ class Base : public ClockedObject /** Registered tlb for address translations */ BaseTLB * tlb; - const unsigned maxCacheLevel; - - /** Proxied Prefetchers in ruby does not have cache as parent - * so need to set probe manager explicitly - */ - ProbeManager *probeManagerDirty; - public: Base(const BasePrefetcherParams &p); virtual ~Base() = default; - virtual void setCache(BaseCache *_cache); - - void setPMInfoDirty(ProbeManager *pm) { probeManagerDirty = pm; } + virtual void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size); /** * Notify prefetcher of cache access (may be any access or just @@ -501,27 +497,27 @@ class Base : public ClockedObject } void - pfHitInCache(int pf_num) + pfHitInCache(PrefetchSourceType pf_type) { prefetchStats.pfHitInCache++; - prefetchStats.pfHitInCache_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInCache_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void - pfHitInMSHR(int pf_num) + pfHitInMSHR(PrefetchSourceType pf_type) { prefetchStats.pfHitInMSHR++; - prefetchStats.pfHitInMSHR_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInMSHR_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void - pfHitInWB(int pf_num) + pfHitInWB(PrefetchSourceType pf_type) { prefetchStats.pfHitInWB++; - prefetchStats.pfHitInWB_srcs[pf_num]++; - prefetchStats.late_srcs[pf_num]++; + prefetchStats.pfHitInWB_srcs[pf_type]++; + prefetchStats.late_srcs[pf_type]++; } void streamPflate() { streamlatenum++; } diff --git a/src/mem/cache/prefetch/berti.cc b/src/mem/cache/prefetch/berti.cc index 7ec09a4c78..4e2a8492b2 100644 --- a/src/mem/cache/prefetch/berti.cc +++ b/src/mem/cache/prefetch/berti.cc @@ -309,8 +309,8 @@ BertiPrefetcher::notifyFill(const PacketPtr &pkt) } // fill latency - Cycles miss_refill_search_lat = ticksToCycles(200); - hitSearchLatency = ticksToCycles(50); + Cycles miss_refill_search_lat = Cycles(0); + hitSearchLatency = Cycles(0); HistoryTableEntry *entry = historyTable.findEntry(pcHash(pkt->req->getPC()), pkt->req->isSecure()); diff --git a/src/mem/cache/prefetch/cdp.cc b/src/mem/cache/prefetch/cdp.cc index 9775ee3a81..381c011366 100644 --- a/src/mem/cache/prefetch/cdp.cc +++ b/src/mem/cache/prefetch/cdp.cc @@ -104,7 +104,7 @@ CDP::calculatePrefetch(const PrefetchInfo &pfi, std::vector &addre PrefetchSourceType pf_source = pfi.getXsMetadata().prefetchSource; int pf_depth = pfi.getXsMetadata().prefetchDepth; bool is_prefetch = - cache->system->getRequestorName(pfi.getRequestorId()).find("dcache.prefetcher") != std::string::npos; + system->getRequestorName(pfi.getRequestorId()).find("dcache.prefetcher") != std::string::npos; if (!miss && pfi.getDataPtr() != nullptr) { if (is_prefetch && enable_prf_filter[pf_source]) { return; @@ -169,13 +169,13 @@ CDP::notifyWithData(const PacketPtr &pkt, bool is_l1_use, std::vectorreq->getVaddr(), pkt->getSize()); - auto *blk = cache->findBlock(pkt->getAddr(), pkt->isSecure()); - if (!blk) { + auto *blk_data = cache->findBlock(pkt->getAddr(), pkt->isSecure()); + if (!blk_data) { 
cdpStats.dataNotifyExitBlockNotFound++; return; } Request::XsMetadata pkt_meta = cache->getHitBlkXsMetadata(pkt); - size_t prefetch_type = cache->system->getRequestorName(pkt->req->requestorId()).find("dcache.prefetcher"); + size_t prefetch_type = system->getRequestorName(pkt->req->requestorId()).find("dcache.prefetcher"); int pf_depth = pkt_meta.prefetchDepth; PrefetchSourceType pf_source = pkt_meta.prefetchSource; if (!is_l1_use && prefetch_type != std::string::npos) { @@ -184,7 +184,7 @@ CDP::notifyWithData(const PacketPtr &pkt, bool is_l1_use, std::vectordata; + const uint64_t *test_addr_start = (const uint64_t *)blk_data; unsigned max_offset = blkSize / sizeof(uint64_t); switch (byteOrder) { case ByteOrder::big: diff --git a/src/mem/cache/prefetch/cmc.cc b/src/mem/cache/prefetch/cmc.cc index a313aaa012..156a199f22 100644 --- a/src/mem/cache/prefetch/cmc.cc +++ b/src/mem/cache/prefetch/cmc.cc @@ -135,7 +135,7 @@ CMCPrefetcher::doPrefetch(const PrefetchInfo &pfi, std::vector &ad // addresses.push_back(AddrPriority(addr, mixedNum, PrefetchSourceType::CMC)); if (sendPFWithFilter(pfi, addr, addresses, priority, PrefetchSourceType::CMC)) { num_send++; - if (maxCacheLevel == 3 && num_send > 24) { + if (num_send > 24) { addresses.back().pfahead = true; addresses.back().pfahead_host = 3; } else if (num_send > 4) { diff --git a/src/mem/cache/prefetch/composite_with_worker.cc b/src/mem/cache/prefetch/composite_with_worker.cc index 02cc83d565..b35de3c98a 100644 --- a/src/mem/cache/prefetch/composite_with_worker.cc +++ b/src/mem/cache/prefetch/composite_with_worker.cc @@ -41,7 +41,7 @@ CompositeWithWorkerPrefetcher::postNotifyInsert(const PacketPtr &trigger_pkt, st if (!samePage(addr_prio.addr, pfi.getAddr())) { statsQueued.pfSpanPage += 1; - if (hasBeenPrefetched(trigger_pkt->getAddr(), trigger_pkt->isSecure())) { + if (hasEverBeenPrefetched(trigger_pkt->getAddr(), trigger_pkt->isSecure())) { statsQueued.pfUsefulSpanPage += 1; } } @@ -65,9 +65,9 @@ CompositeWithWorkerPrefetcher::postNotifyInsert(const PacketPtr &trigger_pkt, st } void -CompositeWithWorkerPrefetcher::setCache(BaseCache *_cache) +CompositeWithWorkerPrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - Base::setCache(_cache); + Base::setParentInfo(sys, pm, _cache, blk_size); } } // namespace prefetch diff --git a/src/mem/cache/prefetch/composite_with_worker.hh b/src/mem/cache/prefetch/composite_with_worker.hh index f569199254..870615ae59 100644 --- a/src/mem/cache/prefetch/composite_with_worker.hh +++ b/src/mem/cache/prefetch/composite_with_worker.hh @@ -23,7 +23,7 @@ class CompositeWithWorkerPrefetcher: public WorkerPrefetcher bool hasHintsWaiting() override { return !localBuffer.empty(); } - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override; diff --git a/src/mem/cache/prefetch/l2_composite_with_worker.cc b/src/mem/cache/prefetch/l2_composite_with_worker.cc index 83854e0292..62050eb340 100644 --- a/src/mem/cache/prefetch/l2_composite_with_worker.cc +++ b/src/mem/cache/prefetch/l2_composite_with_worker.cc @@ -60,11 +60,11 @@ L2CompositeWithWorkerPrefetcher::pfHitNotify(float accuracy, PrefetchSourceType } void -L2CompositeWithWorkerPrefetcher::setCache(BaseCache *_cache) +L2CompositeWithWorkerPrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - cdp->setCache(_cache); + 
cdp->setParentInfo(sys, pm, _cache, blk_size); cdp->setStatsPtr(&prefetchStats); - CompositeWithWorkerPrefetcher::setCache(_cache); + CompositeWithWorkerPrefetcher::setParentInfo(sys, pm, _cache, blk_size); } void diff --git a/src/mem/cache/prefetch/l2_composite_with_worker.hh b/src/mem/cache/prefetch/l2_composite_with_worker.hh index fcd7d6f62e..3cd0939c8f 100644 --- a/src/mem/cache/prefetch/l2_composite_with_worker.hh +++ b/src/mem/cache/prefetch/l2_composite_with_worker.hh @@ -29,7 +29,7 @@ class L2CompositeWithWorkerPrefetcher : public CompositeWithWorkerPrefetcher void rxHint(BaseMMU::Translation *dpp) override; void pfHitNotify(float accuracy, PrefetchSourceType pf_source, const PacketPtr &pkt) override; - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override; diff --git a/src/mem/cache/prefetch/multi.cc b/src/mem/cache/prefetch/multi.cc index 8137e6a231..c67b634192 100644 --- a/src/mem/cache/prefetch/multi.cc +++ b/src/mem/cache/prefetch/multi.cc @@ -54,10 +54,10 @@ Multi::Multi(const MultiPrefetcherParams &p) } void -Multi::setCache(BaseCache *_cache) +Multi::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { for (auto pf : prefetchers) - pf->setCache(_cache); + pf->setParentInfo(sys, pm, _cache, blk_size); } Tick diff --git a/src/mem/cache/prefetch/multi.hh b/src/mem/cache/prefetch/multi.hh index 46469729ca..140e8cda6b 100644 --- a/src/mem/cache/prefetch/multi.hh +++ b/src/mem/cache/prefetch/multi.hh @@ -57,7 +57,7 @@ class Multi : public Base Multi(const MultiPrefetcherParams &p); public: - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; void addTLB(BaseTLB* _t) override; PacketPtr getPacket() override; Tick nextPrefetchReadyTime() const override; diff --git a/src/mem/cache/prefetch/queued.cc b/src/mem/cache/prefetch/queued.cc index 246351c00c..f38f6e91cc 100644 --- a/src/mem/cache/prefetch/queued.cc +++ b/src/mem/cache/prefetch/queued.cc @@ -256,7 +256,7 @@ Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi) if (!samePage(addr_prio.addr, pfi.getAddr())) { statsQueued.pfSpanPage += 1; - if (hasBeenPrefetchedAndNotAccessed(pkt->getAddr(), pkt->isSecure())) { + if (hasBeenPrefetched(pkt->getAddr(), pkt->isSecure())) { statsQueued.pfUsefulSpanPage += 1; } } @@ -379,7 +379,7 @@ Queued::translationComplete(DeferredPacket *dp, bool failed) statsQueued.pfInCache++; DPRINTF(HWPrefetch, "Dropping redundant in " "cache/MSHR prefetch addr:%#x\n", target_paddr); - } else if (target_paddr < 0x80000000) { + } else if (!system->isMemAddr(target_paddr)) { DPRINTF(HWPrefetch, "wrong paddr of prefetch:%#x\n", target_paddr); } else { @@ -552,7 +552,7 @@ Queued::insert(const PacketPtr &pkt, PrefetchInfo &new_pfi, const AddrPriority & "cache/MSHR prefetch addr:%#x\n", target_paddr); return; } - if (has_target_pa && (target_paddr < 0x80000000)) { + if (has_target_pa && !system->isMemAddr(target_paddr)) { DPRINTF(HWPrefetch, "wrong paddr of prefetch:%#x\n", target_paddr); return; } @@ -575,7 +575,7 @@ Queued::insert(const PacketPtr &pkt, PrefetchInfo &new_pfi, const AddrPriority & } else { // Add the translation request and try to resolve it later dpp.setTranslationRequest(translation_req); - dpp.tc = cache->system->threads[translation_req->contextId()]; + dpp.tc = 
system->threads[translation_req->contextId()]; DPRINTF(HWPrefetch, "Prefetch queued with no translation. " "addr:%#x priority: %3d\n", new_pfi.getAddr(), priority); addToQueue(pfqMissingTranslation, dpp); @@ -605,10 +605,10 @@ Queued::addToQueue(std::list &queue, if (dpp.pfahead) { // l1 can not process l3 pfahead request // but l3 can process l1 request - if (dpp.pfahead_host > cache->level()) { - panic("Prefetch req from src %i heading to l%i, but l%i can not process it\n", - dpp.pfInfo.getXsMetadata().prefetchSource, dpp.pfahead_host, cache->level()); - } + // if (dpp.pfahead_host > cache->level()) { + // panic("Prefetch req from src %i heading to l%i, but l%i can not process it\n", + // dpp.pfInfo.getXsMetadata().prefetchSource, dpp.pfahead_host, cache->level()); + // } } queue_size = queueSize; queue_name = "PFQ"; diff --git a/src/mem/cache/prefetch/sms.cc b/src/mem/cache/prefetch/sms.cc index 0d19d4f3b3..424a52eeb6 100644 --- a/src/mem/cache/prefetch/sms.cc +++ b/src/mem/cache/prefetch/sms.cc @@ -23,7 +23,7 @@ XSCompositePrefetcher::XSCompositePrefetcher(const XSCompositePrefetcherParams & p.pht_replacement_policy, PhtEntry(2 * (regionBlks - 1), SatCounter8(3, 2))), phtPFAhead(p.pht_pf_ahead), - phtPFLevel(std::min(p.pht_pf_level, (int) maxCacheLevel)), + phtPFLevel(std::min(p.pht_pf_level, (int) 3)), stats(this), pfBlockLRUFilter(pfFilterSize), pfPageLRUFilter(pfPageFilterSize), @@ -134,10 +134,9 @@ XSCompositePrefetcher::calculatePrefetch(const PrefetchInfo &pfi, std::vector= 3) { - Addr pf_tgt_addr_l3 = decr ? pf_tgt_addr - 256 * blkSize : pf_tgt_addr + 256 * blkSize; // depth here? - sendStreamPF(pfi, pf_tgt_addr_l3, addresses, pfPageLRUFilterL3, decr, 3); - } + + Addr pf_tgt_addr_l3 = decr ? pf_tgt_addr - 256 * blkSize : pf_tgt_addr + 256 * blkSize; // depth here? 
+ sendStreamPF(pfi, pf_tgt_addr_l3, addresses, pfPageLRUFilterL3, decr, 3); } } @@ -640,21 +639,21 @@ XSCompositePrefetcher::XSCompositeStats::XSCompositeStats(statistics::Group *par } void -XSCompositePrefetcher::setCache(BaseCache *_cache) +XSCompositePrefetcher::setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) { - Base::setCache(_cache); + Base::setParentInfo(sys, pm, _cache, blk_size); - largeBOP->setCache(_cache); - smallBOP->setCache(_cache); - learnedBOP->setCache(_cache); + largeBOP->setParentInfo(sys, pm, _cache, blk_size); + smallBOP->setParentInfo(sys, pm, _cache, blk_size); + learnedBOP->setParentInfo(sys, pm, _cache, blk_size); - berti->setCache(_cache); + berti->setParentInfo(sys, pm, _cache, blk_size); if (cmc) - cmc->setCache(_cache); + cmc->setParentInfo(sys, pm, _cache, blk_size); if (ipcp) - ipcp->setCache(_cache); + ipcp->setParentInfo(sys, pm, _cache, blk_size); } } // prefetch diff --git a/src/mem/cache/prefetch/sms.hh b/src/mem/cache/prefetch/sms.hh index cd5f59170e..83ca0bc1c5 100644 --- a/src/mem/cache/prefetch/sms.hh +++ b/src/mem/cache/prefetch/sms.hh @@ -203,7 +203,7 @@ class XSCompositePrefetcher : public Queued hintDownStream->notifyIns(ins_num); } } - void setCache(BaseCache *_cache) override; + void setParentInfo(System *sys, ProbeManager *pm, CacheAccessor* _cache, unsigned blk_size) override; }; } diff --git a/src/mem/cache/prefetch/worker.hh b/src/mem/cache/prefetch/worker.hh index 7e69b5e8a7..f94cca02f7 100644 --- a/src/mem/cache/prefetch/worker.hh +++ b/src/mem/cache/prefetch/worker.hh @@ -51,11 +51,12 @@ class WorkerPrefetcher : public Queued void rxHint(BaseMMU::Translation *dpp) override; std::pair rxMembusRatio(RequestorID requestorId) override { - long totalMissCount = cache->stats.cmd[MemCmd::ReadExReq]->misses.total() + - cache->stats.cmd[MemCmd::ReadSharedReq]->misses.total(); - long missCount = cache->stats.cmd[MemCmd::ReadExReq]->misses[requestorId].value() + - cache->stats.cmd[MemCmd::ReadSharedReq]->misses[requestorId].value(); - return std::pair(missCount, totalMissCount); + // long totalMissCount = cache->stats.cmd[MemCmd::ReadExReq]->misses.total() + + // cache->stats.cmd[MemCmd::ReadSharedReq]->misses.total(); + // long missCount = cache->stats.cmd[MemCmd::ReadExReq]->misses[requestorId].value() + + // cache->stats.cmd[MemCmd::ReadSharedReq]->misses[requestorId].value(); + // return std::pair(missCount, totalMissCount); + return std::pair(0, 0); }; void notifyIns(int ins_num) override { diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index ae2e20c4a3..4fc3019ae4 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -93,6 +93,7 @@ def MakeInclude(source): MakeInclude('slicc_interface/AbstractCacheEntry.hh') MakeInclude('slicc_interface/Message.hh') MakeInclude('slicc_interface/RubyRequest.hh') +MakeInclude('slicc_interface/XsPFMetaData.hh') # External types MakeInclude('common/Address.hh') diff --git a/src/mem/ruby/network/BasicLink.py b/src/mem/ruby/network/BasicLink.py index a275d9bd85..4515506c32 100644 --- a/src/mem/ruby/network/BasicLink.py +++ b/src/mem/ruby/network/BasicLink.py @@ -54,7 +54,7 @@ class BasicExtLink(BasicLink): ext_node = Param.RubyController("External node") int_node = Param.BasicRouter("ID of internal node") - bandwidth_factor = 16 # only used by simple network + bandwidth_factor = 64 # only used by simple network class BasicIntLink(BasicLink): @@ -70,4 +70,4 @@ class BasicIntLink(BasicLink): dst_inport = Param.String("", "Inport direction at dst 
router") # only used by simple network - bandwidth_factor = 16 + bandwidth_factor = 64 diff --git a/src/mem/ruby/network/MessageBuffer.cc b/src/mem/ruby/network/MessageBuffer.cc index 9a4439a538..a9ecbf163a 100644 --- a/src/mem/ruby/network/MessageBuffer.cc +++ b/src/mem/ruby/network/MessageBuffer.cc @@ -48,6 +48,7 @@ #include "base/stl_helpers.hh" #include "debug/RubyQueue.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/Sequencer.hh" namespace gem5 { @@ -355,6 +356,32 @@ MessageBuffer::unregisterDequeueCallback() m_dequeue_callback = nullptr; } +void +MessageBuffer::notifyMissCallback(Tick current_time, Sequencer& sequencer) +{ + int num_readys = 0; + for (auto& msg : m_prio_heap) { + if (msg->getLastEnqueueTime() <= current_time) { + num_readys++; + auto req = dynamic_cast(msg.get()); + sequencer.TBEFullCancel(req->m_LineAddress); + } + } + DPRINTF(RubyQueue, "MessageBuffer: has %d readys but not dequeue, need notifyMissCallback\n", num_readys); +} + +bool +MessageBuffer::hasPrefetchRequest(Addr addr) +{ + for (auto& msg : m_prio_heap) { + auto req = dynamic_cast(msg.get()); + if (req->m_LineAddress == makeLineAddress(addr) && req->m_Prefetch == PrefetchBit_Yes) { + return true; + } + } + return false; +} + void MessageBuffer::clear() { @@ -522,6 +549,19 @@ MessageBuffer::isReady(Tick current_time) const (m_dequeues_this_cy < m_max_dequeue_rate); bool is_ready = (m_prio_heap.size() > 0) && (m_prio_heap.front()->getLastEnqueueTime() <= current_time); + + if (debug::RubyQueue) { + int num_readys = 0; + for (auto& msg : m_prio_heap) + { + if (msg->getLastEnqueueTime() <= current_time) + { + num_readys ++; + } + } + DPRINTF(RubyQueue, "MessageBuffer: has %d readys\n", num_readys); + } + if (!can_dequeue && is_ready) { // Make sure the Consumer executes next cycle to dequeue the ready msg m_consumer->scheduleEvent(Cycles(1)); diff --git a/src/mem/ruby/network/MessageBuffer.hh b/src/mem/ruby/network/MessageBuffer.hh index 03a0454433..b44517ba4a 100644 --- a/src/mem/ruby/network/MessageBuffer.hh +++ b/src/mem/ruby/network/MessageBuffer.hh @@ -71,6 +71,8 @@ namespace gem5 namespace ruby { +class Sequencer; + class MessageBuffer : public SimObject { public: @@ -145,6 +147,10 @@ class MessageBuffer : public SimObject void registerDequeueCallback(std::function callback); void unregisterDequeueCallback(); + void notifyMissCallback(Tick current_time, Sequencer& sequencer); + + bool hasPrefetchRequest(Addr addr); + void recycle(Tick current_time, Tick recycle_latency); bool isEmpty() const { return m_prio_heap.size() == 0; } bool isStallMapEmpty() { return m_stall_msg_map.size() == 0; } diff --git a/src/mem/ruby/network/simple/SimpleNetwork.py b/src/mem/ruby/network/simple/SimpleNetwork.py index e52333b24d..ea0dfcecee 100644 --- a/src/mem/ruby/network/simple/SimpleNetwork.py +++ b/src/mem/ruby/network/simple/SimpleNetwork.py @@ -55,7 +55,7 @@ class SimpleNetwork(RubyNetwork): "default internal buffer size for links and\ routers; 0 indicates infinite buffering", ) - endpoint_bandwidth = Param.Int(1000, "bandwidth adjustment factor") + endpoint_bandwidth = Param.Int(2048, "bandwidth adjustment factor") physical_vnets_channels = VectorParam.Int( [], diff --git a/src/mem/ruby/protocol/RubySlicc_Types.sm b/src/mem/ruby/protocol/RubySlicc_Types.sm index 444111a1b2..881b8b8b76 100644 --- a/src/mem/ruby/protocol/RubySlicc_Types.sm +++ b/src/mem/ruby/protocol/RubySlicc_Types.sm @@ -64,16 +64,6 @@ structure(OutPort, external = "yes", primitive="yes") { bool isDeferredMsgMapEmpty(Addr 
addr); } -structure(InPort, external = "yes", primitive="yes") { - bool isReady(Tick current_time); - Tick dequeue(Tick current_time); - void recycle(Tick current_time, Tick recycle_latency); - bool isEmpty(); - bool isStallMapEmpty(); - int getStallMapSize(); - bool hasStalledMsg(Addr addr); -} - external_type(NodeID, default="0", primitive="yes"); external_type(MachineID); @@ -133,6 +123,7 @@ structure (Sequencer, external = "yes") { Cycles, Cycles, Cycles); void notifyMissCallback(Addr, bool, bool); + void TBEFullCancel(Addr); void writeCallback(Addr, DataBlock); void writeCallback(Addr, DataBlock, bool); @@ -166,11 +157,27 @@ structure (Sequencer, external = "yes") { bool checkResourceAvailable(CacheResourceType, Addr); } +structure(InPort, external = "yes", primitive="yes") { + bool isReady(Tick current_time); + Tick dequeue(Tick current_time); + void recycle(Tick current_time, Tick recycle_latency); + bool isEmpty(); + bool isStallMapEmpty(); + int getStallMapSize(); + bool hasStalledMsg(Addr addr); + void notifyMissCallback(Tick current_time, Sequencer sequencer); +} + structure (HTMSequencer, interface="Sequencer", external = "yes") { // hardware transactional memory void htmCallback(Addr, HtmCallbackMode, HtmFailedInCacheReason); } +structure (XsPFMetaData, desc="...", external="yes") { + bool validXsMetadata, desc="set if the Xs prefetch metadata is valid"; + int prefetchDepth, desc="depth recorded for the issuing prefetch"; +} + structure(RubyRequest, desc="...", interface="Message", external="yes") { Addr LineAddress, desc="Line address for this request"; Addr PhysicalAddress, desc="Physical address for this request"; @@ -193,6 +200,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { bool isSLCSet, default="false",desc="If flag is set, bypass GPU L1 and L2 caches"; RequestPtr getRequestPtr(); + XsPFMetaData getXsPFMeta(); } structure(AbstractCacheEntry, primitive="yes", external = "yes") { @@ -230,6 +238,7 @@ structure (CacheMemory, external = "yes") { void htmCommitTransaction(); void htmAbortTransaction(); + int level(); int getCacheSize(); int getNumBlocks(); Addr getAddressAtIdx(int); @@ -266,11 +275,14 @@ structure (RubyPrefetcher, external = "yes") { } structure(RubyPrefetcherProxy, external = "yes") { - void notifyPfHit(RequestPtr, bool, DataBlock); - void notifyPfMiss(RequestPtr, bool, DataBlock); + void notifyPfHit(RequestPtr, bool, XsPFMetaData, DataBlock); + void notifyPfMiss(RequestPtr, bool, XsPFMetaData, DataBlock); void notifyPfFill(RequestPtr, DataBlock, bool); - void notifyPfEvict(Addr, bool, RequestorID); + void notifyPfEvict(Addr, bool, XsPFMetaData, RequestorID); void completePrefetch(Addr); + void pfHitInCache(XsPFMetaData); + void notifyHitToDownStream(RequestPtr); + void offloadToDownStream(); // SLICC controller must define its own regProbePoints and call // this for every RubyPrefetcherProxy object present void regProbePoints(); diff --git a/src/mem/ruby/protocol/RubySlicc_Util.sm b/src/mem/ruby/protocol/RubySlicc_Util.sm index 104c7c034c..2cbbbb1435 100644 --- a/src/mem/ruby/protocol/RubySlicc_Util.sm +++ b/src/mem/ruby/protocol/RubySlicc_Util.sm @@ -40,6 +40,7 @@ // Miscallaneous Functions +void warn(std::string msg); void error(std::string msg); void assert(bool condition); Cycles zero_time(); @@ -61,3 +62,6 @@ structure(BoolVec, external="yes") { } int countBoolVec(BoolVec bVec); RequestorID getRequestorID(RequestPtr req); +XsPFMetaData getRequestXsMetaData(RequestPtr req); +void setRequestXsMetaData(RequestPtr req, XsPFMetaData pfmeta); +bool XsMetaIsNotNull(XsPFMetaData pfmeta); diff --git
a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm index f85a1aca6e..cb55052602 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-actions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-actions.sm @@ -144,6 +144,7 @@ action(AllocateTBE_SeqRequest, desc="") { out_msg.seqReq := in_msg.getRequestPtr(); out_msg.isSeqReqValid := true; assert(in_msg.Prefetch == PrefetchBit:No); + out_msg.is_from_cpu := true; out_msg.is_local_pf := false; out_msg.is_remote_pf := false; out_msg.txnId := max_outstanding_transactions; @@ -192,6 +193,7 @@ action(AllocateTBE_SeqDvmRequest, desc="") { out_msg.accAddr := in_msg.tlbiTransactionUid; out_msg.accSize := blockSize; assert(in_msg.Prefetch == PrefetchBit:No); + out_msg.is_from_cpu := true; out_msg.is_local_pf := false; out_msg.is_remote_pf := false; @@ -236,6 +238,7 @@ action(AllocateTBE_PfRequest, desc="Allocate TBE for prefetch request") { assert(in_msg.Prefetch != PrefetchBit:No); out_msg.is_local_pf := true; out_msg.is_remote_pf := false; + out_msg.pfmeta := in_msg.getXsPFMeta(); out_msg.txnId := max_outstanding_transactions; if (in_msg.Type == RubyRequestType:LD) { @@ -248,6 +251,10 @@ action(AllocateTBE_PfRequest, desc="Allocate TBE for prefetch request") { } } pfInPort.dequeue(clockEdge()); + if (!storTBEs.areNSlotsAvailable(1)) + { + pfProxy.offloadToDownStream(); + } } action(Initiate_Request, desc="") { @@ -788,7 +795,11 @@ action(Initiate_LoadHit, desc="") { } action(Initiate_LoadMiss, desc="") { - if (tbe.doCacheFill) { + if (is_HN) { + tbe.actions.push(Event:SendReadNoSnp); + tbe.actions.push(Event:CheckCacheFill); + tbe.actions.push(Event:TagArrayWrite); + } else if (tbe.doCacheFill) { tbe.actions.push(Event:SendReadShared); tbe.actions.push(Event:CheckCacheFill); tbe.actions.push(Event:TagArrayWrite); @@ -1543,7 +1554,7 @@ action(Send_ReadShared, desc="") { action(Send_ReadNoSnp, desc="") { assert(is_HN); - assert((tbe.use_DMT == false) || + assert((tbe.use_DMT == false || tbe.is_local_pf) || ((tbe.reqType == CHIRequestType:AtomicReturn) || (tbe.reqType == CHIRequestType:AtomicNoReturn))); @@ -2265,7 +2276,7 @@ action(UpdateDirState_FromReqDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromReqDataResp", tbe); } action(UpdateDirState_FromSnpDataResp, desc="") { @@ -2338,7 +2349,7 @@ action(UpdateDirState_FromSnpDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromSnpDataResp", tbe); } action(UpdateDataState_FromReqDataResp, desc="") { @@ -2420,7 +2431,7 @@ action(UpdateDataState_FromReqDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromReqDataResp", tbe); } action(UpdateDataState_FromWUDataResp, desc="") { @@ -2442,7 +2453,7 @@ action(UpdateDataState_FromWUDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromWUDataResp", tbe); } action(UpdateDataState_FromADataResp, desc="") { @@ -2463,7 +2474,7 @@ action(UpdateDataState_FromADataResp, desc="") { (tbe.reqType == CHIRequestType:AtomicNoReturn))){ tbe.dataMaybeDirtyUpstream := false; } - printTBEState(tbe); + printTBEState("UpdateDataState_FromADataResp", tbe); } action(UpdateDataState_FromCUResp, desc="") { @@ -2475,7 +2486,7 @@ action(UpdateDataState_FromCUResp, desc="") { // self and upstream may have been invalidated while waiting for this // expect to follow up with a ReadUnique } - printTBEState(tbe); + printTBEState("UpdateDataState_FromCUResp", tbe); } action(UpdateDataState_FromSnpDataResp, desc="") { @@ -2521,7 
+2532,7 @@ action(UpdateDataState_FromSnpDataResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDataState_FromSnpDataResp", tbe); } action(UpdateDirState_FromReqResp, desc="") { @@ -2549,7 +2560,7 @@ action(UpdateDirState_FromReqResp, desc="") { } } } - printTBEState(tbe); + printTBEState("UpdateDirState_FromReqResp", tbe); } action(UpdateDirState_FromSnpResp, desc="") { @@ -2611,7 +2622,7 @@ action(UpdateDirState_FromSnpResp, desc="") { tbe.dataMaybeDirtyUpstream := tbe.dir_ownerExists; } - printTBEState(tbe); + printTBEState("UpdateDirState_FromSnpResp", tbe); } action(Receive_ReqResp, desc="") { @@ -2777,7 +2788,7 @@ action(Send_CompData, desc="") { tbe.snd_destination := tbe.requestor; setupPendingSend(tbe); - printTBEState(tbe); + printTBEState("Send_CompData", tbe); } action(Send_WBData, desc="") { @@ -3250,7 +3261,7 @@ action(Send_CompData_AR, desc="") { tbe.requestorToBeOwner := false; tbe.snd_destination := tbe.requestor; setupPendingSend(tbe); - printTBEState(tbe); + printTBEState("Send_CompData_AR", tbe); } @@ -3582,23 +3593,31 @@ action(Callback_LoadHit, desc="") { } } +action(Callback_PrefetchLate, desc="load miss caused by a late prefetch") { + assert(is_valid(tbe)); + if (is_dcache && tbe.is_from_cpu && tbe.is_local_pf && use_prefetcher) { + XsPFMetaData late_in_tbe := tbe.pfmeta; + pfProxy.notifyPfMiss(tbe.seqReq, true, late_in_tbe, tbe.dataBlk); + } +} + action(Callback_DelayedL1DResult_Load, desc="L1D load delayed due to load miss") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { sequencer.notifyMissCallback(tbe.addr, false, false); } } action(Callback_DelayedL1DResult_Store, desc="L1D store delayed due to store") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { sequencer.notifyMissCallback(tbe.addr, true, false); } } action(Callback_DelayedL1DResult_Busy, desc="L1D transaction delayed due to busy") { assert(is_valid(tbe)); - if (is_dcache) { + if (is_dcache && tbe.is_from_cpu) { if (tbe.reqType == CHIRequestType:Load) { sequencer.notifyMissCallback(tbe.addr, false, true); } else if (tbe.reqType == CHIRequestType:Store) { @@ -3650,6 +3669,7 @@ action(Callback_ExpressPrefetchHit, desc="") { cache.profilePrefetchHit(); peek(reqRdyPort, CHIRequestMsg) { assert(in_msg.is_local_pf); + pfProxy.pfHitInCache(in_msg.pfmeta); pfProxy.completePrefetch(in_msg.addr); } } @@ -3763,7 +3783,7 @@ action(Profile_Miss, desc="") { cache.profilePrefetchMiss(); } // notify prefetcher about this demand miss - if (use_prefetcher && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { + if (use_prefetcher && !isAtomicReqType(tbe.reqType) && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { bool is_read := false; if (isReadReqType(tbe.reqType)) { is_read := true; @@ -3773,7 +3793,8 @@ action(Profile_Miss, desc="") { // FIXME: this dataBlk is likely to have stale data. This should be fixed // if our prefetcher uses cached data to make prefetch decisions.
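// A minimal sketch (illustrative only, using names from this patch) of how
// the new XsPFMetaData argument is threaded through the notifications below:
// demand misses pass an empty metadata object, while accesses that touch a
// prefetched block forward the metadata stored in the entry at fill time:
//
//   XsPFMetaData meta;                          // empty: no prefetch provenance
//   if (is_valid(cache_entry) && cache_entry.HWPrefetched) {
//     meta := cache_entry.pfmeta;               // prefetched fill: keep source/depth
//   }
//   pfProxy.notifyPfMiss(tbe.seqReq, is_read, meta, tbe.dataBlk);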
- pfProxy.notifyPfMiss(tbe.seqReq, is_read, tbe.dataBlk); + XsPFMetaData empty; + pfProxy.notifyPfMiss(tbe.seqReq, is_read, empty, tbe.dataBlk); } } @@ -3790,14 +3811,15 @@ action(Profile_Hit, desc="") { cache.profilePrefetchHit(); } // notify prefetcher about this demand hit - if (use_prefetcher && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { + if (use_prefetcher && !isAtomicReqType(tbe.reqType) && tbe.isSeqReqValid && (is_demand || is_remote_can_notify)) { bool is_read := false; if (isReadReqType(tbe.reqType)) { is_read := true; } else { assert(isWriteReqType(tbe.reqType)); } - pfProxy.notifyPfHit(tbe.seqReq, is_read, tbe.dataBlk); + pfProxy.notifyPfHit(tbe.seqReq, is_read, cache_entry.pfmeta, tbe.dataBlk); + pfProxy.notifyHitToDownStream(tbe.seqReq); cache_entry.HWPrefetched := false; } @@ -3807,11 +3829,16 @@ action(Profile_Fill, desc="") { assert(is_valid(tbe)); assert(is_valid(cache_entry)); if (use_prefetcher && tbe.isSeqReqValid) { - cache_entry.HWPrefetched := tbe.is_local_pf || - (tbe.is_remote_pf && - (upstream_prefetch_trains_prefetcher == false)); + cache_entry.HWPrefetched := tbe.is_local_pf || + (tbe.is_remote_pf && upstream_prefetch_trains_prefetcher); + cache_entry.everPrefetched := cache_entry.HWPrefetched; cache_entry.requestor := getRequestorID(tbe.seqReq); + if (cache_entry.HWPrefetched) { + cache_entry.pfmeta := getRequestXsMetaData(tbe.seqReq); + } else { + XsPFMetaData empty; + cache_entry.pfmeta := empty; + } // Prefetchers that use this info require notifications from both // demand and pf fills (unlike notifyPfHit/notifyPfMiss) @@ -3829,7 +3856,7 @@ action(Profile_Eviction, desc="") { sequencer.evictionCallback(address); } if (use_prefetcher && is_valid(cache_entry)) { - pfProxy.notifyPfEvict(address, cache_entry.HWPrefetched, cache_entry.requestor); + pfProxy.notifyPfEvict(address, cache_entry.HWPrefetched, cache_entry.pfmeta, cache_entry.requestor); } } diff --git a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm index d2272d341e..2357fed9f9 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-funcs.sm @@ -316,6 +316,10 @@ bool inCache(Addr addr, bool is_secure) { } } +int level() { + return cache.level(); +} + bool hasBeenPrefetched(Addr addr, bool is_secure, RequestorID requestor) { CacheEntry entry := getCacheEntry(makeLineAddress(addr)); if (is_valid(entry)) { @@ -334,6 +338,25 @@ bool hasBeenPrefetched(Addr addr, bool is_secure) { } } +bool hasEverBeenPrefetched(Addr addr, bool is_secure) { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + if (is_valid(entry)) { + return entry.everPrefetched; + } else { + return false; + } +} + +XsPFMetaData getHitBlkXsMetadata(Addr addr, bool is_secure) { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + XsPFMetaData empty; + if (is_valid(entry)) { + return entry.pfmeta; + } else { + return empty; + } +} + bool inMissQueue(Addr addr, bool is_secure) { Addr line_addr := makeLineAddress(addr); TBE tbe := getCurrentActiveTBE(line_addr); @@ -344,6 +367,16 @@ bool coalesce() { return false; } +AbstractCacheEntry CacheEntryToAbstract(CacheEntry entry), return_by_pointer="yes"; + +AbstractCacheEntry findBlock(Addr addr, bool is_secure), return_by_pointer="yes" { + CacheEntry entry := getCacheEntry(makeLineAddress(addr)); + if (is_valid(entry)) { + return CacheEntryToAbstract(entry); + } + return OOD; +} + void notifyCoalesced(Addr addr, RubyRequestType type, RequestPtr req, DataBlock data_blk, bool was_miss) { DPRINTF(RubySlicc,
"notifyCoalesced(addr=%#x, type=%s, was_miss=%d)\n", @@ -357,10 +390,11 @@ void notifyCoalesced(Addr addr, RubyRequestType type, RequestPtr req, bool is_read := (type == RubyRequestType:LD) || (type == RubyRequestType:Load_Linked) || (type == RubyRequestType:IFETCH); + XsPFMetaData empty; if (was_miss) { - pfProxy.notifyPfMiss(req, is_read, data_blk); + pfProxy.notifyPfMiss(req, is_read, empty, data_blk); } else { - pfProxy.notifyPfHit(req, is_read, data_blk); + pfProxy.notifyPfHit(req, is_read, empty, data_blk); } } } @@ -444,8 +478,10 @@ TBE allocateRequestTBE(Addr addr, CHIRequestMsg in_msg), return_by_pointer="yes" tbe.isSeqReqValid := in_msg.isSeqReqValid; tbe.seqReq := in_msg.seqReq; + tbe.is_from_cpu := in_msg.is_from_cpu; tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; + tbe.pfmeta := in_msg.pfmeta; tbe.atomic_op.clear(); tbe.atomic_op.orMask(in_msg.atomic_op); @@ -485,6 +521,7 @@ TBE allocateDvmRequestTBE(Addr txnId, CHIRequestMsg in_msg), return_by_pointer=" tbe.isSeqReqValid := in_msg.isSeqReqValid; tbe.seqReq := in_msg.seqReq; + tbe.is_from_cpu := in_msg.is_from_cpu; tbe.is_local_pf := in_msg.is_local_pf; tbe.is_remote_pf := in_msg.is_remote_pf; @@ -842,14 +879,18 @@ bool upstreamHasShared(State state) { (state == State:UC_RSC) || (state == State:SC_RSC); } -void printTBEState(TBE tbe) { - DPRINTF(RubySlicc, "STATE: addr: %#x data present=%d valid=%d unique=%d dirty=%d mu_dirty=%d dir ownerV=%d ownerE=%d sharers=%d tobe_I=%d tobe_SC=%d doFill=%d pendAction=%s txnId=%d\n", - tbe.addr, tbe.dataBlkValid.isFull(), tbe.dataValid, tbe.dataUnique, - tbe.dataDirty, tbe.dataMaybeDirtyUpstream, tbe.dir_ownerExists, - tbe.dir_ownerIsExcl,tbe.dir_sharers.count(), - tbe.dataToBeInvalid, tbe.dataToBeSharedClean, - tbe.doCacheFill, tbe.pendAction, tbe.txnId); - DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); +void printTBEState(std::string pos, TBE tbe) { + if (is_valid(tbe)) { + DPRINTF(RubySlicc, "At %s: STATE: addr: %#x data present=%d valid=%d unique=%d dirty=%d mu_dirty=%d dir ownerV=%d ownerE=%d sharers=%d tobe_I=%d tobe_SC=%d doFill=%d pendAction=%s txnId=%d\n", + pos, tbe.addr, tbe.dataBlkValid.isFull(), tbe.dataValid, tbe.dataUnique, + tbe.dataDirty, tbe.dataMaybeDirtyUpstream, tbe.dir_ownerExists, + tbe.dir_ownerIsExcl,tbe.dir_sharers.count(), + tbe.dataToBeInvalid, tbe.dataToBeSharedClean, + tbe.doCacheFill, tbe.pendAction, tbe.txnId); + } else { + DPRINTF(RubySlicc, "At %s: invalid tbe\n", pos); + } + // DPRINTF(RubySlicc, "dataBlkValid = %s\n", tbe.dataBlkValid); } void printDvmTBEState(TBE tbe) { @@ -945,7 +986,7 @@ void copyCacheAndDir(CacheEntry cache_entry, DirEntry dir_entry, tbe.dataToBeSharedClean := false; tbe.dataToBeInvalid := false; - printTBEState(tbe); + printTBEState("copyCacheAndDir", tbe); } void copyCacheAndDirTBEs(TBE src, TBE dst) { @@ -961,7 +1002,7 @@ void copyCacheAndDirTBEs(TBE src, TBE dst) { dst.dir_owner := src.dir_owner; dst.dir_ownerExists := src.dir_ownerExists; dst.dir_ownerIsExcl := src.dir_ownerIsExcl; - printTBEState(dst); + printTBEState("copyCacheAndDirTBEs", dst); } void deallocateReqTBE(TBE tbe) { @@ -1093,7 +1134,7 @@ State makeFinalStateHelper(State cs, State ds) { State makeFinalState(TBE tbe, CacheEntry cache_entry, DirEntry dir_entry) { setDataToBeStates(tbe); - printTBEState(tbe); + printTBEState("makeFinalState", tbe); State cache_state := State:I; State dir_state := State:I; @@ -1207,6 +1248,14 @@ bool isWriteReqType(CHIRequestType type) { return false; } +bool isAtomicReqType(CHIRequestType 
type) { + if (type == CHIRequestType:AtomicLoad || + type == CHIRequestType:AtomicStore) { + return true; + } + return false; +} + bool isStashReqType(CHIRequestType type) { if (type == CHIRequestType:StashOnceShared || type == CHIRequestType:StashOnceUnique) { diff --git a/src/mem/ruby/protocol/chi/CHI-cache-ports.sm b/src/mem/ruby/protocol/chi/CHI-cache-ports.sm index 8bb76fdb25..c4e84a4189 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-ports.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-ports.sm @@ -318,6 +318,9 @@ in_port(reqRdyPort, CHIRequestMsg, reqRdy, rank=3, } else { CacheEntry cache_entry := getCacheEntry(in_msg.addr); TBE tbe := getCurrentActiveTBE(in_msg.addr); + if (is_valid(tbe)) { + tbe.is_from_cpu := in_msg.is_from_cpu; + } DirEntry dir_entry := getDirEntry(in_msg.addr); @@ -421,6 +424,9 @@ in_port(seqInPort, RubyRequest, mandatoryQueue, rank=1) { ); } } else { + if (!storTBEs.areNSlotsAvailable(1) || !reqRdy.areNSlotsAvailable(1, curTick())) { + seqInPort.notifyMissCallback(clockEdge(), sequencer); + } trigger(Event:AllocSeqRequest, in_msg.LineAddress, getCacheEntry(in_msg.LineAddress), getCurrentActiveTBE(in_msg.LineAddress)); @@ -482,7 +488,7 @@ void processNextState(Addr address, TBE tbe, CacheEntry cache_entry) { } } - printTBEState(tbe); + printTBEState("processNextState", tbe); // we might be going to BUSY_INTERRUPTABLE so wakeup pending snoops // if any diff --git a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm index 8d531e93c0..03bae6bdde 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache-transitions.sm @@ -968,6 +968,7 @@ transition({BUSY_BLKD,BUSY_INTR}, } transition({BUSY_BLKD,BUSY_INTR}, Load) { + Callback_PrefetchLate; Callback_DelayedL1DResult_Load; StallRequest; } diff --git a/src/mem/ruby/protocol/chi/CHI-cache.sm b/src/mem/ruby/protocol/chi/CHI-cache.sm index fce153f35e..a10ca4e5b5 100644 --- a/src/mem/ruby/protocol/chi/CHI-cache.sm +++ b/src/mem/ruby/protocol/chi/CHI-cache.sm @@ -167,7 +167,7 @@ machine(MachineType:Cache, "Cache coherency protocol") : // If the responder has the line in UC or UD state, propagate this state // on a ReadShared. Notice data won't be deallocated if dealloc_on_unique is // set - bool fwd_unique_on_readshared := "False"; + bool fwd_unique_on_readshared := "True"; // Allow receiving data in SD state. 
bool allow_SD; @@ -584,8 +584,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : structure(CacheEntry, interface="AbstractCacheEntry") { State state, desc="SLICC line state"; DataBlock DataBlk, desc="data for the block"; + bool everPrefetched, default="false", desc="Set if this cache entry was ever prefetched"; bool HWPrefetched, default="false", desc="Set if this cache entry was prefetched"; RequestorID requestor, desc="First requestor to fill this block"; + XsPFMetaData pfmeta, desc="Xs prefetch meta data"; } // Directory entry @@ -665,8 +667,10 @@ machine(MachineType:Cache, "Cache coherency protocol") : // if either is set prefetchers are not notified on miss/hit/fill and // demand hit/miss stats are not incremented + bool is_from_cpu, default="false", desc="Request generated by CPU"; bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + XsPFMetaData pfmeta, desc="Prefetch metadata attached when a local prefetcher generated the request"; // Atomic info associated with the transaction WriteMask atomic_op, desc="Atomic Operation Wrapper"; diff --git a/src/mem/ruby/protocol/chi/CHI-msg.sm b/src/mem/ruby/protocol/chi/CHI-msg.sm index ce0d3d88cf..5369e5a8f7 100644 --- a/src/mem/ruby/protocol/chi/CHI-msg.sm +++ b/src/mem/ruby/protocol/chi/CHI-msg.sm @@ -115,8 +115,10 @@ structure(CHIRequestMsg, desc="", interface="Message") { RequestPtr seqReq, default="nullptr", desc="Pointer to original request from CPU/sequencer (nullptr if not valid)"; bool isSeqReqValid, default="false", desc="Set if seqReq is valid (not nullptr)"; + bool is_from_cpu, default="false", desc="Request generated by CPU"; bool is_local_pf, desc="Request generated by a local prefetcher"; bool is_remote_pf, desc="Request generated a prefetcher in another cache"; + XsPFMetaData pfmeta, desc="Prefetch metadata attached when a local prefetcher generated the request"; WriteMask atomic_op, desc="Atomic Operation Wrapper"; diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 0049c9d90c..63b11d61cc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ namespace ruby class Network; class GPUCoalescer; class DMASequencer; +class AbstractCacheEntry; // used to communicate that an in_port peeked the wrong message type class RejectException: public std::exception @@ -358,19 +359,30 @@ class AbstractController : public ClockedObject, public Consumer, public HasDown virtual bool inCache(const Addr &addr, const bool &is_secure) { fatal("inCache: prefetching not supported"); return false; } + virtual int level() + { fatal("level: prefetching not supported"); return 0; } + virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure) { fatal("hasBeenPrefetched: prefetching not supported"); return false; } - virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure, - const RequestorID &requestor) + virtual bool hasBeenPrefetched(const Addr &addr, const bool &is_secure, const RequestorID &requestor) { fatal("hasBeenPrefetched: prefetching not supported"); return false; } + virtual bool hasEverBeenPrefetched(const Addr &addr, const bool &is_secure) + { fatal("hasEverBeenPrefetched: prefetching not supported"); return false; } + + virtual Request::XsMetadata getHitBlkXsMetadata(const Addr &addr, const bool &is_secure) + { fatal("getHitBlkXsMetadata: prefetching not supported"); return
Request::XsMetadata(); } + virtual bool inMissQueue(const Addr &addr, const bool &is_secure) { fatal("inMissQueue: prefetching not supported"); return false; } virtual bool coalesce() { fatal("coalesce: prefetching not supported"); return false; } + virtual AbstractCacheEntry* findBlock(const Addr &addr, const bool &is_secure) + { fatal("findBlock: prefetching not supported"); return nullptr; } + friend class RubyPrefetcherProxy; protected: diff --git a/src/mem/ruby/slicc_interface/Controller.py b/src/mem/ruby/slicc_interface/Controller.py index ef8a0afbf1..ead3327617 100644 --- a/src/mem/ruby/slicc_interface/Controller.py +++ b/src/mem/ruby/slicc_interface/Controller.py @@ -54,7 +54,7 @@ class RubyController(ClockedObject): cluster_id = Param.UInt32(0, "Id of this controller's cluster") transitions_per_cycle = Param.Int( - 32, "no. of SLICC state machine transitions per cycle" + 1024, "no. of SLICC state machine transitions per cycle" ) buffer_size = Param.UInt32(0, "max buffer size 0 means infinite") diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index d0de3b1f13..0e90aa9ef8 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -51,6 +51,7 @@ #include "mem/ruby/protocol/PrefetchBit.hh" #include "mem/ruby/protocol/RubyAccessMode.hh" #include "mem/ruby/protocol/RubyRequestType.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" namespace gem5 { @@ -225,6 +226,10 @@ class RubyRequest : public Message const int& getSize() const { return m_Size; } const PrefetchBit& getPrefetch() const { return m_Prefetch; } RequestPtr getRequestPtr() const { return m_pkt->req; } + XsPFMetaData getXsPFMeta() const + { + return m_pkt->req->getXsMetadata(); + } void setWriteMask(uint32_t offset, uint32_t len, std::vector< std::pair<int, AtomicOpFunctor*>> atomicOps); diff --git a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh index 8df56c7013..5647270405 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_Util.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_Util.hh @@ -58,6 +58,7 @@ #include "mem/ruby/common/TypeDefines.hh" #include "mem/ruby/common/WriteMask.hh" #include "mem/ruby/protocol/RubyRequestType.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" namespace gem5 { @@ -65,6 +66,8 @@ namespace ruby { +class AbstractCacheEntry; + inline Cycles zero_time() { return Cycles(0); } inline Cycles intToCycles(int c) { return Cycles(c); } @@ -322,6 +325,30 @@ getRequestorID(RequestPtr req) return req->requestorId(); } +inline XsPFMetaData +getRequestXsMetaData(RequestPtr& req) +{ + return req->getXsMetadata(); +} + +inline void +setRequestXsMetaData(RequestPtr& req, XsPFMetaData& pfmeta) +{ + req->setXsMetadata(pfmeta); +} + +inline bool +XsMetaIsNotNull(XsPFMetaData& pfmeta) +{ + return pfmeta.prefetchSource != PrefetchSourceType::PF_NONE; +} + +inline AbstractCacheEntry* +CacheEntryToAbstract(AbstractCacheEntry* entry) +{ + return entry; +} + } // namespace ruby } // namespace gem5 diff --git a/src/mem/ruby/slicc_interface/XsPFMetaData.hh b/src/mem/ruby/slicc_interface/XsPFMetaData.hh new file mode 100644 index 0000000000..4c0f9d391d --- /dev/null +++ b/src/mem/ruby/slicc_interface/XsPFMetaData.hh @@ -0,0 +1,19 @@ +#ifndef __MEM_RUBY_SLICC_INTERFACE_XSPFMETADATA_HH__ +#define __MEM_RUBY_SLICC_INTERFACE_XSPFMETADATA_HH__ +#include "mem/request.hh" + +namespace gem5 +{ + +using XsPFMetaData = Request::XsMetadata; + +inline +std::ostream&
operator<<(std::ostream& os, const XsPFMetaData& meta) +{ + os << "pfsource[" << meta.prefetchSource << "]"; + return os; +} + +} + +#endif diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index 90d67fb29b..b98d1c8316 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -77,6 +77,7 @@ CacheMemory::CacheMemory(const Params &p) p.ruby_system->clockPeriod()), cacheMemoryStats(this) { + m_level = p.level; m_cache_size = p.size; m_cache_assoc = p.assoc; m_replacementPolicy_ptr = p.replacement_policy; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 58e8722a00..761aa6ad5e 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -155,6 +155,7 @@ class CacheMemory : public SimObject void htmCommitTransaction(); public: + int level() const { return m_level; } int getCacheSize() const { return m_cache_size; } int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } @@ -189,6 +190,7 @@ class CacheMemory : public SimObject BankedArray tagArray; ALUFreeListArray atomicALUArray; + int m_level; int m_cache_size; int m_cache_num_sets; int m_cache_num_set_bits; diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 2f457f5c4a..eb181f4609 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -35,6 +35,7 @@ class RubyCache(SimObject): cxx_class = "gem5::ruby::CacheMemory" cxx_header = "mem/ruby/structures/CacheMemory.hh" + level = Param.Int(-1, "the level of cache (l1 is 1, l2 is 2, etc.)") size = Param.MemorySize("capacity in bytes") assoc = Param.Int("") replacement_policy = Param.BaseReplacementPolicy(TreePLRURP(), "") diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.cc b/src/mem/ruby/structures/RubyPrefetcherProxy.cc index 80844da0d7..7e688d8066 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.cc +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.cc @@ -38,6 +38,7 @@ #include "mem/ruby/structures/RubyPrefetcherProxy.hh" #include "debug/HWPrefetch.hh" +#include "mem/ruby/slicc_interface/RubySlicc_Util.hh" #include "mem/ruby/system/RubySystem.hh" namespace gem5 @@ -50,6 +51,7 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent, prefetch::Base* _prefetcher, MessageBuffer *_pf_queue) :Named(_parent->name()), + stat(_prefetcher), prefetcher(_prefetcher), cacheCntrl(_parent), pfQueue(_pf_queue), @@ -63,16 +65,22 @@ RubyPrefetcherProxy::RubyPrefetcherProxy(AbstractController* _parent, fatal_if(!pfQueue, "%s initializing a RubyPrefetcherProxy without a prefetch queue", name()); - // prefetcher->setParentInfo( - // cacheCntrl->params().system, - // cacheCntrl->getProbeManager(), - // RubySystem::getBlockSizeBytes()); - // Cannot do this below: RPP is not SimObject - // block size is same as system! 
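// Because the proxy now implements CacheAccessor (passed as `this` in the
// setParentInfo() call below), a prefetcher can query the Ruby cache exactly
// as it would a classic cache. A minimal usage sketch, assuming a
// prefetch::Base subclass keeps the accessor in its `cache` member:
//
//   if (cache->inCache(line_addr, is_secure) ||
//       cache->inMissQueue(line_addr, is_secure))
//       return;  // line already present or pending; drop the prefetch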
- prefetcher->setPMInfoDirty(cacheCntrl->getProbeManager()); + prefetcher->setParentInfo( + cacheCntrl->params().system, + cacheCntrl->getProbeManager(), + this, + RubySystem::getBlockSizeBytes()); } } +RubyPrefetcherProxy::PfProxyStat::PfProxyStat(statistics::Group *parent) + : statistics::Group(parent, "RubyPrefetcherProxy"), + ADD_STAT(notifymiss, "times the prefetcher was notified of a demand miss"), + ADD_STAT(notifyhit, "times the prefetcher was notified of a demand hit"), + ADD_STAT(issuedPf, "prefetch requests issued to the cache controller") +{ +} + void RubyPrefetcherProxy::scheduleNextPrefetch() { @@ -124,9 +132,7 @@ RubyPrefetcherProxy::issuePrefetch() pkt->getAddr(), line_addr, pkt->needsWritable()); - RubyRequestType req_type = pkt->needsWritable() ? - RubyRequestType_ST : RubyRequestType_LD; - + RubyRequestType req_type = RubyRequestType_LD; std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(cacheCntrl->clockEdge(), pkt->getAddr(), @@ -136,10 +142,11 @@ RubyPrefetcherProxy::issuePrefetch() RubyAccessMode_Supervisor, pkt, PrefetchBit_Yes); - + assert(msg->getRequestPtr()->hasXsMetadata()); // enqueue request into prefetch queue to the cache pfQueue->enqueue(msg, cacheCntrl->clockEdge(), cacheCntrl->cyclesToTicks(Cycles(1))); + stat.issuedPf++; // track all pending PF requests issuedPfPkts[line_addr] = pkt; @@ -157,33 +164,51 @@ } void -RubyPrefetcherProxy::notifyPfHit(const RequestPtr& req, bool is_read, +RubyPrefetcherProxy::notifyPfHit(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk) { assert(ppHit); assert(req); + stat.notifyhit++; Packet pkt(req, is_read ? Packet::makeReadCmd(req) : Packet::makeWriteCmd(req)); // NOTE: for now we only communicate physical address with prefetchers pkt.dataStaticConst(data_blk.getData(getOffset(req->getPaddr()), pkt.getSize())); DPRINTF(HWPrefetch, "notify hit: %s\n", pkt.print()); + pkt.missOnLatePf = false; + pkt.pfSource = pfmeta.prefetchSource; + pkt.pfDepth = pfmeta.prefetchDepth; ppHit->notify(&pkt); scheduleNextPrefetch(); } void -RubyPrefetcherProxy::notifyPfMiss(const RequestPtr& req, bool is_read, +RubyPrefetcherProxy::notifyPfMiss(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk) { assert(ppMiss); assert(req); - Packet pkt(req, is_read ? Packet::makeReadCmd(req) : - Packet::makeWriteCmd(req)); + stat.notifymiss++; + Packet pkt(req, is_read ?
MemCmd::ReadReq : MemCmd::WriteReq); // NOTE: for now we only communicate physical address with prefetchers pkt.dataStaticConst(data_blk.getData(getOffset(req->getPaddr()), pkt.getSize())); DPRINTF(HWPrefetch, "notify miss: %s\n", pkt.print()); + pkt.missOnLatePf = (pfmeta.prefetchSource != PrefetchSourceType::PF_NONE); + pkt.pfSource = pfmeta.prefetchSource; + pkt.pfDepth = pfmeta.prefetchDepth; + if (!pkt.missOnLatePf && issuedPfPkts.count(makeLineAddress(req->getPaddr())) > 0) + { + auto pfpkt = issuedPfPkts[makeLineAddress(req->getPaddr())]; + pkt.missOnLatePf = true; + pkt.pfSource = pfpkt->req->getXsMetadata().prefetchSource; + pkt.pfDepth = pfpkt->req->getXsMetadata().prefetchDepth; + } + if (XsMetaIsNotNull(pfmeta)) { + prefetcher->pfHitInMSHR(pfmeta.prefetchSource); + } + prefetcher->incrDemandMhsrMisses(); ppMiss->notify(&pkt); scheduleNextPrefetch(); } @@ -207,19 +232,42 @@ RubyPrefetcherProxy::notifyPfFill(const RequestPtr& req, } void -RubyPrefetcherProxy::notifyPfEvict(Addr blkAddr, bool hwPrefetched, +RubyPrefetcherProxy::notifyPfEvict(Addr blkAddr, bool hwPrefetched, XsPFMetaData& pfmeta, RequestorID requestorID) { DPRINTF(HWPrefetch, "notify evict: %#x hw_pf=%d\n", blkAddr, hwPrefetched); // DataUpdate data_update( // blkAddr, false, requestorID, *this); // Maybe using the old DataUpdate here is enough + if (hwPrefetched) { + prefetcher->prefetchUnused(pfmeta.prefetchSource); + } DataUpdate data_update(blkAddr, false); // data_update.hwPrefetched = hwPrefetched; ppDataUpdate->notify(data_update); scheduleNextPrefetch(); } +void +RubyPrefetcherProxy::pfHitInCache(const XsPFMetaData& pfmeta) +{ + prefetcher->pfHitInCache(pfmeta.prefetchSource); +} + +void +RubyPrefetcherProxy::notifyHitToDownStream(const RequestPtr& req) +{ + // TODO: +} + +void +RubyPrefetcherProxy::offloadToDownStream() +{ + if (prefetcher->hasHintDownStream()) { + prefetcher->offloadToDownStream(); + } +} + void RubyPrefetcherProxy::regProbePoints() { diff --git a/src/mem/ruby/structures/RubyPrefetcherProxy.hh b/src/mem/ruby/structures/RubyPrefetcherProxy.hh index 79c311ba4d..47dce0b3d4 100644 --- a/src/mem/ruby/structures/RubyPrefetcherProxy.hh +++ b/src/mem/ruby/structures/RubyPrefetcherProxy.hh @@ -43,8 +43,11 @@ // #include "mem/cache/cache_probe_arg.hh" #include "mem/cache/base.hh" #include "mem/cache/prefetch/base.hh" +#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh" #include "mem/ruby/slicc_interface/AbstractController.hh" #include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/slicc_interface/XsPFMetaData.hh" +#include "mem/ruby/system/RubySystem.hh" namespace gem5 { @@ -54,7 +57,7 @@ namespace ruby // Removed cache accessor using DataUpdate = BaseCache::DataUpdate; -class RubyPrefetcherProxy : /*public CacheAccessor,*/ public Named +class RubyPrefetcherProxy : public CacheAccessor, public Named { public: @@ -71,20 +74,37 @@ class RubyPrefetcherProxy : /*public CacheAccessor,*/ public Named /** * Notify PF probes hit/miss/fill */ - void notifyPfHit(const RequestPtr& req, bool is_read, + void notifyPfHit(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk); - void notifyPfMiss(const RequestPtr& req, bool is_read, + + void notifyPfMiss(const RequestPtr& req, bool is_read, XsPFMetaData& pfmeta, const DataBlock& data_blk); + void notifyPfFill(const RequestPtr& req, const DataBlock& data_blk, bool from_pf); - void notifyPfEvict(Addr blkAddr, bool hwPrefetched, + void notifyPfEvict(Addr blkAddr, bool hwPrefetched, XsPFMetaData& pfmeta, RequestorID 
requestorID); + void pfHitInCache(const XsPFMetaData& pfmeta); + + void notifyHitToDownStream(const RequestPtr& req); + + void offloadToDownStream(); + /** Registers probes. */ void regProbePoints(); private: + struct PfProxyStat : public statistics::Group + { + PfProxyStat(statistics::Group *parent); + statistics::Scalar notifymiss; + statistics::Scalar notifyhit; + statistics::Scalar issuedPf; + } stat; + + /** Schedule the next ready prefetch */ void scheduleNextPrefetch(); @@ -129,29 +149,49 @@ /** Accessor functions */ - // bool inCache(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->inCache(addr, is_secure); - // } - - // bool hasBeenPrefetched(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->hasBeenPrefetched(addr, is_secure); - // } - - // bool hasBeenPrefetched(Addr addr, bool is_secure, - // RequestorID requestor) const override - // { - // return cacheCntrl->hasBeenPrefetched(addr, is_secure, requestor); - // } - - // bool inMissQueue(Addr addr, bool is_secure) const override - // { - // return cacheCntrl->inMissQueue(addr, is_secure); - // } - - // bool coalesce() const override - // { return cacheCntrl->coalesce(); } + bool inCache(Addr addr, bool is_secure) const override + { + return cacheCntrl->inCache(addr, is_secure); + } + + virtual unsigned level() const override + { + return cacheCntrl->level(); + } + + bool hasBeenPrefetched(Addr addr, bool is_secure) const override + { + return cacheCntrl->hasBeenPrefetched(addr, is_secure); + } + + bool hasBeenPrefetched(Addr addr, bool is_secure, + RequestorID requestor) const override + { + return cacheCntrl->hasBeenPrefetched(addr, is_secure, requestor); + } + + bool hasEverBeenPrefetched(Addr addr, bool is_secure) const override + { + return cacheCntrl->hasEverBeenPrefetched(addr, is_secure); + } + + Request::XsMetadata getHitBlkXsMetadata(PacketPtr pkt) override + { + return cacheCntrl->getHitBlkXsMetadata(pkt->getAddr(), pkt->isSecure()); + } + + bool inMissQueue(Addr addr, bool is_secure) const override + { + return cacheCntrl->inMissQueue(addr, is_secure); + } + + bool coalesce() const override + { return cacheCntrl->coalesce(); } + + const uint8_t* findBlock(Addr addr, bool is_secure) const override + { + return cacheCntrl->findBlock(addr, is_secure)->getDataBlk().getData(0, RubySystem::getBlockSizeBytes()); + } }; diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 169eb9a21c..dffb5bb34d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -67,8 +67,17 @@ namespace gem5 namespace ruby { +Sequencer::SequencerStat::SequencerStat(statistics::Group *parent) + : statistics::Group(parent, "Sequencer"), + ADD_STAT(notifymiss, "delayed-miss notifications received from the cache"), + ADD_STAT(loadcancel, "pending loads whose speculative result was cancelled") +{ +} + Sequencer::Sequencer(const Params &p) - : RubyPort(p), m_IncompleteTimes(MachineType_NUM), + : RubyPort(p), + stat(this), + m_IncompleteTimes(MachineType_NUM), deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check") { m_outstanding_count = 0; @@ -376,6 +385,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType primary_type, DPRINTF(RubySequencer, "Pkt %#lx %s is delayed because blk is busy doing ruby stuff\n", pkt, pkt->cmdString()); ruby_custom_signal_callback(pkt); + stat.loadcancel++; } } return RequestStatus_Aliased; @@ -629,7 +639,7 @@ void Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy) { assert(address == makeLineAddress(address)); -
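// notifyMissCallback() and the new TBEFullCancel() below share one
// cancellation pattern: walk the per-line list of outstanding sequencer
// requests and squash the speculation of every pending load, roughly:
//
//   for (auto &seq_req : m_RequestTable[address])
//       if (seq_req.pkt->isRead())
//           ruby_custom_signal_callback(seq_req.pkt);  // ask the core to replay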
stat.notifymiss++; auto it = m_RequestTable.find(address); assert(it != m_RequestTable.end()); @@ -639,16 +649,40 @@ Sequencer::notifyMissCallback(Addr address, bool is_upgrade, bool is_busy) for (auto &seq_req: seq_req_list) { if (seq_req.pkt->isRead()) { ruby_custom_signal_callback(seq_req.pkt); + stat.loadcancel++; } } m_BusyBlocks.insert(address); - DPRINTF(RubySequencer, "A %s of addr %#x signals the delay of all pending loads", + DPRINTF(RubySequencer, "A %s of addr %#x signals the delay of all pending loads\n", is_upgrade ? "load" : "store", address); return; } +void +Sequencer::TBEFullCancel(Addr address) +{ + assert(address == makeLineAddress(address)); + + auto it = m_RequestTable.find(address); + assert(it != m_RequestTable.end()); + + auto &seq_req_list = it->second; + + // cancel pending loads' speculation + for (auto &seq_req: seq_req_list) { + if (seq_req.pkt->isRead()) { + ruby_custom_signal_callback(seq_req.pkt); + stat.loadcancel++; + } + } + + DPRINTF(RubySequencer, "A TBE-full event at addr %#x signals the delay of all pending loads\n", + address); + return; +} + void Sequencer::atomicCallback(Addr address, DataBlock& data, const bool externalHit, const MachineType mach, diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 85a2a1597a..0ed88d247d 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -129,6 +129,8 @@ class Sequencer : public RubyPort void notifyMissCallback(Addr address, bool is_upgrade, bool is_snoop); + void TBEFullCancel(Addr address); + void atomicCallback(Addr address, DataBlock& data, const bool externalHit = false, @@ -250,6 +252,14 @@ class Sequencer : public RubyPort RubyRequestType secondary_type); private: + + struct SequencerStat : public statistics::Group + { + SequencerStat(statistics::Group *parent); + statistics::Scalar notifymiss; + statistics::Scalar loadcancel; + } stat; + int m_max_outstanding_requests; int m_num_pending_invs; diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 3f570fb952..e3a13c87f8 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -104,7 +104,7 @@ class RubySequencer(RubyPort): dcache = Param.RubyCache("") max_outstanding_requests = Param.Int( - 16, "max requests (incl. prefetches) outstanding" + 1024, "max requests (incl. prefetches) outstanding" ) deadlock_threshold = Param.Cycles( 500000,