Skip to content

Add DWC2 cache maintenance routines for STM32 #2963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from

Conversation

HiFiPhile
Copy link
Collaborator

@HiFiPhile HiFiPhile commented Jan 25, 2025

Describe the PR

  • Support DMA + DCache ON
  • Refactored buffer alignment macro to take into account cache line size

Now #define CFG_TUD_DWC2_DMA_ENABLE 1 is enough.

It is preferable to declare a non-cached region with the MPU instead of relying on cache invalidate+clean; benchmarks on STM32H7S3 and i.MX RT1170 show that frequent cache invalidate+clean really hurts performance.

** Need rebase after #2960

Benchmark code
extern uint32_t SystemCoreClock;

/**
  * @brief Programs the SWO trace registers so trace output runs at 12 MHz.
  *
  * Register names below follow the labels used in the original snippet.
  * The writes are performed in a fixed order: unlock, divisor, protocol,
  * funnel enable.
  * @retval None
  */
void SWD_Init(void)
{
  volatile uint32_t *const swtf_lar  = (volatile uint32_t *)0x5C004FB0;
  volatile uint32_t *const swo_lar   = (volatile uint32_t *)0x5C003FB0;
  volatile uint32_t *const swo_codr  = (volatile uint32_t *)0x5C003010;
  volatile uint32_t *const swo_sppr  = (volatile uint32_t *)0x5C0030F0;
  volatile uint32_t *const swtf_ctrl = (volatile uint32_t *)0x5C004000;

  /* Unlock the trace funnel, then the SWO block, with the access key */
  *swtf_lar = 0xC5ACCE55;
  *swo_lar  = 0xC5ACCE55;

  /* NOTE(review): blink_interval_ms is not declared in this snippet; saving the
     previous divisor into it looks like a debugging leftover - confirm before removing. */
  blink_interval_ms = *swo_codr;

  /* Current output divisor: value = (CPU freq / SWO speed) - 1, SWO speed = 12 MHz */
  *swo_codr = ((SystemCoreClock / 12000000) - 1);

  /* Selected pin protocol register */
  *swo_sppr = 0x00000002;

  /* Enable the ITM input of the SWO trace funnel (read-modify-write, bit 0) */
  *swtf_ctrl |= 0x00000001;
}

/* Forward declaration: helper that derives the Size and BaseAddress fields of an
   MPU region descriptor from a raw address/size pair (defined further below). */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit);

/**
  * @brief Programs the MPU regions used by the benchmark.
  *
  * Regions are configured in this order, using consecutive region numbers
  * starting at MPU_REGION_NUMBER0:
  *   1. a 4 GB background region with no access permitted,
  *   2. a non-cacheable region covering the linker-provided NONCACHEABLEBUFFER
  *      range (skipped when that range has size 0),
  *   3. a cacheable, executable region covering the code (ROM/flash) range.
  * Every remaining region up to __MPU_REGIONCOUNT is then disabled and the MPU
  * is re-enabled with MPU_PRIVILEGED_DEFAULT.
  *
  * A single MPU_InitStruct is reused for all regions, so any field that is not
  * reassigned before a HAL_MPU_ConfigRegion() call keeps the value written for
  * the previous region.
  * @retval None
  */
static void MPU_Config(void)
{
  MPU_Region_InitTypeDef MPU_InitStruct = {0};
  uint32_t index = MPU_REGION_NUMBER0; /* next region number to program */
  uint32_t address;
  uint32_t size;

  /* Disable the MPU */
  HAL_MPU_Disable();

  /* Initialize the background region */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.BaseAddress = 0x0;
  MPU_InitStruct.Size = MPU_REGION_SIZE_4GB;
  /* 0x87 sets bits 0, 1, 2 and 7; presumably these sub-regions are excluded from
     the no-access rule and fall back to the default memory map - TODO confirm */
  MPU_InitStruct.SubRegionDisable = 0x87;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL0;
  MPU_InitStruct.AccessPermission = MPU_REGION_NO_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Initialize the non cacheable region */
#if defined ( __ICCARM__ )
  /* get the region attribute from the icf file */
  extern uint32_t NONCACHEABLEBUFFER_start;
  extern uint32_t NONCACHEABLEBUFFER_size;

  /* Both values are carried by the symbols' addresses, not by their contents */
  address = (uint32_t)&NONCACHEABLEBUFFER_start;
  size = (uint32_t)&NONCACHEABLEBUFFER_size;

#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Base;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$Length;
  extern uint32_t Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;

  /* Total size is the sum of the two length symbols (Length and ZI$$Length) */
  address = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Base;
  size  = (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$Length + (uint32_t)&Image$$RW_NONCACHEABLEBUFFER$$ZI$$Length;
#elif defined ( __GNUC__ )
  extern int __NONCACHEABLEBUFFER_BEGIN;
  extern int __NONCACHEABLEBUFFER_END;

  /* Size is the distance between the begin and end symbols */
  address = (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
  size  = (uint32_t)&__NONCACHEABLEBUFFER_END - (uint32_t)&__NONCACHEABLEBUFFER_BEGIN;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* An empty non-cacheable range consumes no MPU region */
  if (size != 0)
  {
    /* Configure the MPU attributes as Normal Non Cacheable */
    MPU_InitStruct.Enable = MPU_REGION_ENABLE;
    MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
    MPU_InitStruct.IsBufferable = MPU_ACCESS_NOT_BUFFERABLE;
    MPU_InitStruct.IsCacheable = MPU_ACCESS_NOT_CACHEABLE;
    MPU_InitStruct.IsShareable = MPU_ACCESS_NOT_SHAREABLE;
    MPU_InitStruct.Number = index;
    MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
    MPU_InitStruct.SubRegionDisable = 0x00;
    MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_DISABLE;
    /* Size and BaseAddress are filled in by the helper */
    MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
    index++;
  }

  /* Initialize the region corresponding to the execution area
     (external or internal flash or external or internal RAM
     depending on scatter file definition) */
#if defined ( __ICCARM__ )
  extern uint32_t __ICFEDIT_region_ROM_start__;
  extern uint32_t __ICFEDIT_region_ROM_end__;
  address = (uint32_t)&__ICFEDIT_region_ROM_start__;
  /* +1: presumably the end symbol is the last valid address (inclusive) - TODO confirm */
  size = (uint32_t)&__ICFEDIT_region_ROM_end__ - (uint32_t)&__ICFEDIT_region_ROM_start__ + 1;
#elif defined (__CC_ARM) || defined(__ARMCC_VERSION)
  extern uint32_t Image$$ER_ROM$$Base;
  extern uint32_t Image$$ER_ROM$$Limit;
  address = (uint32_t)&Image$$ER_ROM$$Base;
  size    = (uint32_t)&Image$$ER_ROM$$Limit-(uint32_t)&Image$$ER_ROM$$Base;
#elif defined ( __GNUC__ )
  extern uint32_t __FLASH_BEGIN;
  extern uint32_t __FLASH_SIZE;
  address = (uint32_t)&__FLASH_BEGIN;
  /* The size is encoded as the address of __FLASH_SIZE */
  size  = (uint32_t)&__FLASH_SIZE;
#else
#error "Compiler toolchain is unsupported"
#endif

  /* Execution area: full access, instruction fetch allowed, cacheable and bufferable.
     Unlike the non-cacheable region, this one is configured without a size != 0 check. */
  MPU_InitStruct.Enable = MPU_REGION_ENABLE;
  MPU_InitStruct.Number = index;
  MPU_InitStruct.SubRegionDisable = 0u;
  MPU_InitStruct.TypeExtField = MPU_TEX_LEVEL1;
  MPU_InitStruct.AccessPermission = MPU_REGION_FULL_ACCESS;
  MPU_InitStruct.DisableExec = MPU_INSTRUCTION_ACCESS_ENABLE;
  MPU_InitStruct.IsShareable = MPU_ACCESS_SHAREABLE;
  MPU_InitStruct.IsCacheable = MPU_ACCESS_CACHEABLE;
  MPU_InitStruct.IsBufferable = MPU_ACCESS_BUFFERABLE;
  MPU_AdjustRegionAddressSize(address, size, &MPU_InitStruct);
  HAL_MPU_ConfigRegion(&MPU_InitStruct);
  index++;

  /* Reset unused MPU regions */
  for(; index < __MPU_REGIONCOUNT ; index++)
  {
    /* All unused regions disabled; only Enable and Number change here, the
       remaining fields still hold the execution-area values */
    MPU_InitStruct.Enable = MPU_REGION_DISABLE;
    MPU_InitStruct.Number = index;
    HAL_MPU_ConfigRegion(&MPU_InitStruct);
  }

  /* Enable the MPU */
  HAL_MPU_Enable(MPU_PRIVILEGED_DEFAULT);
}

/**
  * @brief This function adjusts the MPU region Address and Size within an MPU configuration.
  *
  * The Size field is written in the MPU encoding, where a region spans
  * 2^(Size + 1) bytes. The requested size is rounded up to the next power of
  * two and clamped to the 32-byte minimum (field value 4), so Size values
  * below 32, including 0, no longer produce an out-of-range shift.
  *
  * BaseAddress is Address rounded down to a multiple of the resulting region
  * size, which is the alignment the ARMv7-M MPU requires for a region base.
  * This also keeps the low bits of BaseAddress clear.
  *
  * @note The region is not enlarged when rounding the base down leaves the
  *       end of [Address, Address + Size) outside the region; the caller
  *       (linker script) is expected to provide suitably aligned ranges.
  *
  * @param Address memory address
  * @param Size memory size in bytes
  * @param pInit pointer to an MPU initialization structure; only its Size and
  *        BaseAddress fields are written
  * @retval None
  */
static void MPU_AdjustRegionAddressSize(uint32_t Address, uint32_t Size, MPU_Region_InitTypeDef* pInit)
{
  /* Smallest encodable region: field value 4 -> 2^(4 + 1) = 32 bytes */
  uint32_t SizeField = 4u;
  uint32_t BaseAddress = 0u;

  if (Size > 32u)
  {
    /* ceil(log2(Size)) - 1. Size - 1 is non-zero here, so __CLZ is well defined.
       Sizes above 2 GB yield 31, i.e. the 4 GB region. */
    SizeField = 31u - (uint32_t)__CLZ(Size - 1u);
  }

  if (SizeField < 31u)
  {
    /* Unsigned shift: the count is at most 31, so this cannot overflow */
    uint32_t RegionBytes = (uint32_t)1u << (SizeField + 1u);

    /* Align the base down to the region size */
    BaseAddress = Address & ~(RegionBytes - 1u);
  }
  /* else: a 4 GB region can only start at address 0 */

  pInit->Size = (uint8_t)SizeField;
  pInit->BaseAddress = BaseAddress;
}

/* Number of bytes copied by each benchmark run */
#define TEST_SIZE 1024

/* Two buffers placed in the "dtcm_data" section, each aligned to 32 bytes */
__attribute__((section("dtcm_data"), aligned(32))) uint8_t buffer1[TEST_SIZE];
__attribute__((section("dtcm_data"), aligned(32))) uint8_t buffer2[TEST_SIZE];

/* Buffer placed in the "noncacheable" section (presumably the range that
   MPU_Config marks as non-cacheable - confirm against the linker script) */
__attribute__((section("noncacheable"), aligned(32))) uint8_t buffer_ncache[TEST_SIZE];

/* Buffer with default placement, used by the benchmark as the "cached" case */
__attribute__((aligned(32))) uint8_t buffer_cached[TEST_SIZE];

/**
  * @brief Times one memcpy, optionally bracketed by D-cache maintenance.
  *
  * The cycle counter is zeroed and started before the optional invalidate and
  * stopped after the optional clean, so the returned count includes the cache
  * maintenance calls as well as the copy itself.
  *
  * @param dst destination buffer
  * @param src source buffer
  * @param size number of bytes to copy
  * @param flush when true, clean the D-cache over dst after the copy
  * @param invalidate when true, invalidate the D-cache over src before the copy
  * @retval cycle counter value read after the counter is stopped
  */
unsigned int test_loop(void* dst, const void* src, int size, bool flush, bool invalidate)
{
    /* Debug/trace registers, named after the labels in the original snippet */
    volatile unsigned int *const demcr    = (uint32_t *)0xE000EDFC; /* SCB_DEMCR */
    volatile unsigned int *const dwt_ctrl = (uint32_t *)0xE0001000; /* DWT_CONTROL */
    volatile unsigned int *const cyccnt   = (uint32_t *)0xE0001004; /* DWT_CYCCNT */

    /* Set bit 24 of DEMCR, clear the counter, then set bit 0 of DWT_CONTROL */
    *demcr |= 0x01000000;
    *cyccnt = 0;
    *dwt_ctrl |= 1;

    if (invalidate)
    {
        SCB_InvalidateDCache_by_Addr((uint32_t*)src, size);
    }

    memcpy(dst, src, size);

    if (flush)
    {
        SCB_CleanDCache_by_Addr((uint32_t*)dst, size);
    }

    /* Clear bit 0 of DWT_CONTROL before sampling the counter */
    *dwt_ctrl &= ~1;
    return *cyccnt;
}

/**
  * @brief Runs one benchmark case: prints its title, times 8 copies of
  *        TEST_SIZE bytes with test_loop(), and prints the throughput derived
  *        from the last measured cycle count.
  * @param title label printed before the measurements
  * @param dst destination buffer handed to test_loop()
  * @param src source buffer handed to test_loop()
  * @param flush forwarded to test_loop()
  * @param invalidate forwarded to test_loop()
  * @retval None
  */
static void run_benchmark_case(const char *title, void *dst, const void *src, bool flush, bool invalidate)
{
  unsigned int cycle = 0;

  printf("%s\r\n", title);
  for (int i = 0; i < 8; i++)
  {
    cycle = test_loop(dst, src, TEST_SIZE, flush, invalidate);
    /* cycle is unsigned, so it is printed with %u */
    printf("Loop:%d   Cycle:%u\r\n", i, cycle);
  }

  /* bytes/s = bytes * clock / cycles. Multiply first, in 64 bits, so the result
     neither truncates (clock / cycles rounded down) nor overflows 32 bits.
     A zero cycle count is reported as 0 MB/s instead of dividing by zero. */
  uint64_t bytes_per_second = 0;
  if (cycle != 0u)
  {
    bytes_per_second = ((uint64_t)TEST_SIZE * SystemCoreClock) / cycle;
  }
  printf("Throughput:%lu MB/s\r\n", (unsigned long)(bytes_per_second / 1048576u));
}

/**
  * @brief Benchmark entry point: configures the MPU, board and SWO trace, then
  *        times memcpy between the DTCM, non-cacheable and cached buffers in
  *        both directions, with and without cache maintenance. Never returns.
  */
int main(void) {
  MPU_Config();
  board_init();
  SWD_Init();

  printf("\r\nmemcpy benchmark \r\n");

  /* Fill the DTCM source buffer with a repeating 0..255 pattern */
  for (int i = 0; i < TEST_SIZE; i++)
  {
    buffer1[i] = (uint8_t)i;
  }

  /* Writes out of DTCM */
  run_benchmark_case("DTCM - DTCM", buffer2, buffer1, false, false);
  run_benchmark_case("DTCM - NonCache", buffer_ncache, buffer1, false, false);
  run_benchmark_case("DTCM - Cache", buffer_cached, buffer1, false, false);
  run_benchmark_case("DTCM - Cache+Flush", buffer_cached, buffer1, true, false);

  /* Reads back into DTCM */
  run_benchmark_case("NonCache - DTCM", buffer1, buffer_ncache, false, false);
  run_benchmark_case("Cache - DTCM", buffer1, buffer_cached, false, false);
  run_benchmark_case("Cache+Invalidate - DTCM", buffer1, buffer_cached, false, true);

  while (1)
  {
  }
}

Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
@HiFiPhile
Copy link
Collaborator Author

Looks like my HIL instance has a license issue; I think we can add the env locally.

@pstadelmann
Copy link

pstadelmann commented Jan 27, 2025

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

@HiFiPhile
Copy link
Collaborator Author

Works great for me, both for CDC and UVC. Thanks !

In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test.
It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header (e.g. stm32h747xx.h), which includes core_cm7.h, and __CORTEX_M is defined there. Did I miss something?

@hathach
Copy link
Owner

hathach commented Jan 28, 2025

Thanks @HiFiPhile for the great PR as usual. However, I am off for TET (Lunar New Year) and won't be able to review this for 2 weeks. Happy New Year 🎉

@pstadelmann
Copy link

Works great for me, both for CDC and UVC. Thanks !
In addition to #define CFG_TUD_DWC2_DMA_ENABLE 1 I also had to add #define __CORTEX_M 7

Thanks for your test. It's a little strange that you need to define __CORTEX_M; normally stm32h7xx.h includes the device header (e.g. stm32h747xx.h), which includes core_cm7.h, and __CORTEX_M is defined there. Did I miss something?

I don't think so. I'm using a custom RTOS which relies on its own set of headers, that's why.

@HiFiPhile
Copy link
Collaborator Author

Thanks @HiFiPhile for the great PR as usual. However, I am off for TET (Lunar New Year) and won't be able to review this for 2 weeks. Happy New Year 🎉

Happy new year also 🎊

Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Signed-off-by: HiFiPhile <[email protected]>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants