Skip to content

Commit

Permalink
Redesign free temp dir monitoring feature (#1394)
Browse files Browse the repository at this point in the history
* Redesign free temp dir monitoring feature

The previous version of this feature relied on a pre-defined limit for how much space needs to be available in the tmp directory. This posed a problem for non-homogeneous farms, where hosts can differ greatly in storage size.

This new implementation relies on a pre-defined minimum percentage of available temporary storage.
  • Loading branch information
DiegoTavares authored Jul 10, 2024
1 parent 7109fc8 commit efa22a4
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ public class HostReportHandler {
private static final String SUBJECT_COMMENT_FULL_TEMP_DIR = "Host set to REPAIR for not having enough storage " +
"space on the temporary directory (mcp)";
private static final String CUEBOT_COMMENT_USER = "cuebot";
private static final String WINDOWS_OS = "Windows";

// A cache <hostname_frameId, count> to store kill requests and count the number of occurrences.
// The cache expires after write to avoid growing unbounded. If a request for a host-frame doesn't appear
Expand Down Expand Up @@ -182,7 +183,11 @@ public void handleHostReport(HostReport report, boolean isBoot) {
rhost.getLoad(), new Timestamp(rhost.getBootTime() * 1000l),
rhost.getAttributesMap().get("SP_OS"));

changeHardwareState(host, report.getHost().getState(), isBoot, report.getHost().getFreeMcp());
// Both logics are conflicting, only change hardware state if
// there was no need for a tempDirStorage state change
if (!changeStateForTempDirStorage(host, report.getHost())) {
changeHardwareState(host, report.getHost().getState(), isBoot);
}
changeNimbyState(host, report.getHost());

/**
Expand Down Expand Up @@ -247,12 +252,10 @@ public void handleHostReport(HostReport report, boolean isBoot) {
}
}

// The minimum amount of free space in the temporary directory to book a host
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);

if (minBookableFreeTempDir != -1 && report.getHost().getFreeMcp() < minBookableFreeTempDir) {
msg = String.format("%s doens't have enough free space in the temporary directory (mcp), %dMB needs %dMB",
host.name, (report.getHost().getFreeMcp()/1024), (minBookableFreeTempDir/1024));
if (!isTempDirStorageEnough(report.getHost().getTotalMcp(), report.getHost().getFreeMcp(), host.os)) {
msg = String.format(
"%s doens't have enough free space in the temporary directory (mcp), %dMB",
host.name, (report.getHost().getFreeMcp()/1024));
}
else if (host.idleCores < Dispatcher.CORE_POINTS_RESERVED_MIN) {
msg = String.format("%s doesn't have enough idle cores, %d needs %d",
Expand Down Expand Up @@ -333,6 +336,27 @@ else if (!dispatchSupport.isCueBookable(host)) {
}
}

/**
 * Check whether the reported temp storage size and availability is enough for running a job.
 *
 * Use dispatcher.min_available_temp_storage_percentage (opencue.properties) to
 * define what's the accepted threshold. The check always passes when that property
 * is -1 (feature turned off) or when the host reports Windows as its OS, as this
 * feature is currently not available on Windows hosts.
 *
 * @param tempTotalStorage Total storage on the temp directory
 * @param tempFreeStorage Free storage on the temp directory
 * @param hostOs Reported os, may be null (handled as a non-Windows host)
 * @return true if the check is turned off, the host runs Windows, or the percentage
 *         of free temp storage is greater than or equal to the configured minimum;
 *         false otherwise
 */
private boolean isTempDirStorageEnough(Long tempTotalStorage, Long tempFreeStorage, String hostOs) {
    // The minimum percentage of free space in the temporary directory to book a host
    int minAvailableTempPercentage = env.getRequiredProperty(
            "dispatcher.min_available_temp_storage_percentage", Integer.class);

    // Compare from the constant side so that a host without a reported OS does not
    // raise a NullPointerException while its report is being handled
    if (minAvailableTempPercentage == -1 || WINDOWS_OS.equalsIgnoreCase(hostOs)) {
        return true;
    }

    // Floating point division, so a reported total of 0 does not throw: the result
    // is Infinity (check passes) when free > 0 and NaN (check fails) when free is 0
    double availableTempPercentage = (tempFreeStorage * 100.0) / tempTotalStorage;
    return availableTempPercentage >= minAvailableTempPercentage;
}

/**
* Update the hardware state property.
*
Expand All @@ -342,62 +366,11 @@ else if (!dispatchSupport.isCueBookable(host)) {
* updated with a boot report. If the state is Repair, then state is
* never updated via RQD.
*
*
* Prevent cue frames from booking on hosts with full temporary directories.
*
* Change host state to REPAIR or UP according the amount of free space
* in the temporary directory:
* - Set the host state to REPAIR, when the amount of free space in the
* temporary directory is less than the minimum required. Add a comment with
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
* - Set the host state to UP, when the amount of free space in the temporary directory
* is greater or equals to the minimum required and the host has a comment with
* subject: SUBJECT_COMMENT_FULL_TEMP_DIR
*
* @param host
* @param reportState
* @param isBoot
* @param freeTempDir
*/
private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot, long freeTempDir) {

// The minimum amount of free space in the temporary directory to book a host
Long minBookableFreeTempDir = env.getRequiredProperty("dispatcher.min_bookable_free_temp_dir_kb", Long.class);

// Prevent cue frames from booking on hosts with full temporary directories
if (minBookableFreeTempDir != -1 && !host.os.equalsIgnoreCase("Windows")) {
if (host.hardwareState == HardwareState.UP && freeTempDir < minBookableFreeTempDir) {

// Insert a comment indicating that the Host status = Repair with reason = Full temporary directory
CommentDetail c = new CommentDetail();
c.subject = SUBJECT_COMMENT_FULL_TEMP_DIR;
c.user = CUEBOT_COMMENT_USER;
c.timestamp = null;
c.message = "Host " + host.getName() + " marked as REPAIR. The current amount of free space in the " +
"temporary directory (mcp) is " + (freeTempDir/1024) + "MB. It must have at least "
+ (minBookableFreeTempDir/1024) + "MB of free space in temporary directory";
commentManager.addComment(host, c);

// Set the host state to REPAIR
hostManager.setHostState(host, HardwareState.REPAIR);
host.hardwareState = HardwareState.REPAIR;

return;
} else if (host.hardwareState == HardwareState.REPAIR && freeTempDir >= minBookableFreeTempDir) {
// Check if the host with REPAIR status has comments with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
// user=CUEBOT_COMMENT_USER and delete the comments, if they exists
boolean commentsDeleted = commentManager.deleteCommentByHostUserAndSubject(host,
CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_TEMP_DIR);

if (commentsDeleted) {
// Set the host state to UP
hostManager.setHostState(host, HardwareState.UP);
host.hardwareState = HardwareState.UP;
return;
}
}
}

private void changeHardwareState(DispatchHost host, HardwareState reportState, boolean isBoot) {
// If the states are the same there is no reason to do this update.
if (host.hardwareState.equals(reportState)) {
return;
Expand Down Expand Up @@ -427,6 +400,61 @@ private void changeHardwareState(DispatchHost host, HardwareState reportState, b
}
}

/**
 * Keep the host hardware state in sync with the free space reported for its
 * temporary directory (mcp), so frames are not booked on hosts whose temporary
 * directory is full.
 *
 * - An UP host without enough temp storage receives a comment with
 *   subject SUBJECT_COMMENT_FULL_TEMP_DIR and is set to REPAIR.
 * - A REPAIR host with enough temp storage is set back to UP, but only if a
 *   comment with subject SUBJECT_COMMENT_FULL_TEMP_DIR written by
 *   CUEBOT_COMMENT_USER existed and was deleted.
 *
 * @param host Host whose hardware state may be updated
 * @param reportHost Host data received on the report
 * @return true if the hardware state was changed by this method, false otherwise
 */
private boolean changeStateForTempDirStorage(DispatchHost host, RenderHost reportHost) {
    // Minimum percentage of free temp storage required to book a host
    int requiredPercentage = env.getRequiredProperty(
            "dispatcher.min_available_temp_storage_percentage", Integer.class);

    boolean storageIsEnough = isTempDirStorageEnough(
            reportHost.getTotalMcp(), reportHost.getFreeMcp(), host.os);

    if (storageIsEnough) {
        // Only hosts currently in REPAIR are candidates to be brought back
        if (host.hardwareState != HardwareState.REPAIR) {
            return false;
        }

        // The host is only set to UP if the REPAIR state was caused by a full
        // temp dir, which is identified by the comment left by cuebot
        boolean removedComments = commentManager.deleteCommentByHostUserAndSubject(
                host, CUEBOT_COMMENT_USER, SUBJECT_COMMENT_FULL_TEMP_DIR);
        if (!removedComments) {
            return false;
        }

        hostManager.setHostState(host, HardwareState.UP);
        host.hardwareState = HardwareState.UP;
        return true;
    }

    // Not enough storage: only hosts that are UP get moved to REPAIR
    if (host.hardwareState != HardwareState.UP) {
        return false;
    }

    // Leave a comment explaining why the host is being set to REPAIR
    long requiredTempMb = (long)((requiredPercentage / 100.0) * reportHost.getTotalMcp()/ 1024);
    CommentDetail repairComment = new CommentDetail();
    repairComment.subject = SUBJECT_COMMENT_FULL_TEMP_DIR;
    repairComment.user = CUEBOT_COMMENT_USER;
    repairComment.timestamp = null;
    repairComment.message = "Host " + host.getName()
            + " marked as REPAIR. The current amount of free space in the "
            + "temporary directory (mcp) is " + (reportHost.getFreeMcp()/1024)
            + "MB. It must have at least " + requiredTempMb
            + "MB of free space in temporary directory";
    commentManager.addComment(host, repairComment);

    hostManager.setHostState(host, HardwareState.REPAIR);
    host.hardwareState = HardwareState.REPAIR;
    return true;
}

/**
* Changes the NIMBY lock state. If the DB indicates a NIMBY lock
* but RQD does not, then the host is unlocked. If the DB indicates
Expand Down
7 changes: 3 additions & 4 deletions cuebot/src/main/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,10 @@ dispatcher.report_queue.max_pool_size=8
# Queue capacity for handling Host Report.
dispatcher.report_queue.queue_capacity=1000

# The minimum amount of free space in the temporary directory (mcp) to book a host.
# E.g: 1G = 1048576 kB => dispatcher.min_bookable_free_temp_dir_kb=1048576
# Default = -1 (deactivated)
# The minimum percentage of available space on the temporary directory (mcp) to book a host.
# Hosts with a lower percentage of free space than the limit will be marked as REPAIR.
# If equal to -1, the feature is turned off.
dispatcher.min_bookable_free_temp_dir_kb=-1
dispatcher.min_available_temp_storage_percentage=20

# Number of threads to keep in the pool for kill frame operation.
dispatcher.kill_queue.core_pool_size=6
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -297,14 +297,16 @@ public void testHandleHostReportWithFullTemporaryDirectories() {
* Precondition:
* - HardwareState=UP
* Action:
* - Receives a HostReport with freeTempDir < dispatcher.min_bookable_free_temp_dir_kb (opencue.properties)
* - Receives a HostReport with less freeTempDir than the threshold
* (opencue.properties: min_available_temp_storage_percentage)
* Postcondition:
* - Host hardwareState changes to REPAIR
* - A comment is created with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and user=CUEBOT_COMMENT_USER
* - A comment is created with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
* user=CUEBOT_COMMENT_USER
* */
// Create HostReport
// Create HostReport with totalMcp=4GB and freeMcp=128MB
HostReport report1 = HostReport.newBuilder()
.setHost(getRenderHostBuilder(hostname).setFreeMcp(1024L).build())
.setHost(getRenderHostBuilder(hostname).setFreeMcp(CueUtil.MB128).build())
.setCoreInfo(cores)
.build();
// Call handleHostReport() => Create the comment with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and change the
Expand Down Expand Up @@ -337,9 +339,11 @@ public void testHandleHostReportWithFullTemporaryDirectories() {
* Test 2:
* Precondition:
* - HardwareState=REPAIR
* - There is a comment for the host with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and user=CUEBOT_COMMENT_USER
* - There is a comment for the host with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and
* user=CUEBOT_COMMENT_USER
* Action:
* - Receives a HostReport with freeTempDir >= dispatcher.min_bookable_free_temp_dir_kb (opencue.properties)
* Receives a HostReport with more freeTempDir than the threshold
* (opencue.properties: min_available_temp_storage_percentage)
* Postcondition:
* - Host hardwareState changes to UP
* - Comment with subject=SUBJECT_COMMENT_FULL_TEMP_DIR and user=CUEBOT_COMMENT_USER gets deleted
Expand Down
2 changes: 1 addition & 1 deletion cuebot/src/test/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ dispatcher.kill_queue.queue_capacity=1000
dispatcher.booking_queue.core_pool_size=6
dispatcher.booking_queue.max_pool_size=6
dispatcher.booking_queue.queue_capacity=1000
dispatcher.min_bookable_free_temp_dir_kb=1048576
dispatcher.min_available_temp_storage_percentage=20
dispatcher.min_bookable_free_mcp_kb=1048576
dispatcher.oom_max_safe_used_memory_threshold=0.95
dispatcher.oom_frame_overboard_allowed_threshold=0.6
Expand Down

0 comments on commit efa22a4

Please sign in to comment.