Skip to content

Commit

Permalink
Merge pull request #407 from StoXProject/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
arnejohannesholmin authored Nov 29, 2024
2 parents 7d0f2c4 + e005ad4 commit 175de88
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 183 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: RstoxData
Version: 2.1.0
Date: 2024-11-06
Version: 2.1.1-9001
Date: 2024-11-29
Title: Tools to Read and Manipulate Fisheries Data
Authors@R: c(
person(given = "Edvin",
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# RstoxData v2.1.1_9001 (2024-11-25)
* Removed rscript_args from mapplyOnCores(), since this caused the Renviron to be ignored in the sub processes, resulting in errors with finding RstoxFramework in the case of multiple libraries on Linux and macOS.
* Fixed the 29 cm bug again, as it was not properly fixed in RstoxData v2.1.0. Refactored how precision is set both when reading data and in the ICESBiotic() and ICESAcoustic().
* Introduced the EchoType as a column in the Data table of ICESBiotic() (and WriteICESBiotic()).


# RstoxData v2.1.0 (2024-11-04)
* Final release for StoX 4.1.0.
* Fixed bug in DefineTranslation, where the ConditionalVariableNames was showing as a single string and not a vector.
Expand Down
18 changes: 18 additions & 0 deletions R/Definitions.R
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,24 @@ initiateRstoxData <- function(){
lengthCode_unit_table[, rank := seq_len(.N)]


# Define conversion functions from xml types:
conversionFunctionName <- list(
"xsd:ID" = "as.character",
"xsd:float" = "as.double",
"xs:string" = "as.character",
"xsd:string" = "as.character",
#"xsd:int" = "integer",
"xsd:int" = "asIntegerAfterRound",
"xs:long" = "asIntegerAfterRound",
#"xs:integer" = "integer",
"xs:integer" = "asIntegerAfterRound",
"xs:decimal" = "as.double",
"xs:date" = "as.character",
"xs:time" = "as.character",
"xs:double" = "as.double"
)


#### Assign to RstoxDataEnv and return the definitions: ####
definitionsNames <- ls()
definitions <- lapply(definitionsNames, get, pos = environment())
Expand Down
114 changes: 4 additions & 110 deletions R/StoxExport.R
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ checkAndCreateICESAcousticCSV <- function(ICESAcousticDataOne) {
checkICESAcousticDefinitions (ICESAcousticDataOne)

# Set classes of the variables, especially taking care of NAs. The class of the variables is used later to format the output from WriteICESAcoustic, where NA double type is stored as empty sting to support these beingg empty fields in the written file:
setClassICESAcoustic(ICESAcousticDataOne)
lapply(names(ICESAcousticDataOne), setClass_OneTable, ICESAcousticDataOne, RstoxData::xsdObjects$icesAcoustic.xsd)


#### Rename columns to start with the table name:
Expand Down Expand Up @@ -595,7 +595,7 @@ BioticDataToICESBioticOne <- function(
ICESBioticDataOne <- ICESBioticDataOne[hierarchicalTables]

# Set classes of the variables, especially taking care of NAs. The class of the variables is used later to format the output from WriteICESBiotic, where NA double type is stored as empty sting to support these being empty fields in the written file:
setClassICESBiotic(ICESBioticDataOne)
lapply(names(ICESBioticDataOne), setClass_OneTable, ICESBioticDataOne, RstoxData::xsdObjects$icesBiotic.xsd)


# Create a table of the original and new column names, but remove keys:
Expand Down Expand Up @@ -748,6 +748,8 @@ BioticData_NMDToICESBioticOne <- function(
# We must filter records with aphia == NA
catchRaw <- catchRaw[!is.na(aphia)]



Catch <- catchRaw[, .(
LocalID = cruise,
Gear = gear,
Expand Down Expand Up @@ -862,9 +864,6 @@ BioticData_NMDToICESBioticOne <- function(
}





ICESBioticCSV <- list(
Cruise = Cruise,
Haul = Haul,
Expand All @@ -878,117 +877,12 @@ BioticData_NMDToICESBioticOne <- function(



setClassICESBiotic <- function(data, tables = names(data), xsd = RstoxData::xsdObjects$icesBiotic.xsd) {
setClassICES(data, xsd, tables = names(data))
}

setClassICESAcoustic <- function(data, tables = names(data), xsd = RstoxData::xsdObjects$icesAcoustic.xsd) {
setClassICES(data, xsd, tables = names(data))
}



setClassICES <- function(data, xsd, tables = names(data)) {
# Get the classes per table:
classes <- mapply(
structure,
lapply(
xsd$tableTypes[tables],
translateSimple,
old = c(
"xsd:float",
"xsd:int",
"xsd:string",
"xs:string",
"xsd:ID"
),
new = c(
"numeric",
"integer",
"character",
"character",
"character"
)
),
names = xsd$tableHeaders[tables],
SIMPLIFY = FALSE
)
classes <- lapply(classes, as.list)

for(table in tables) {
data[[table]] <- data[[table]][, lapply(names(.SD), changeClassOfNonNA, classes = classes[[table]], data = data[[table]])]
}
}

translateSimple <- function(x, old, new) {
if(length(old) != length(new)) {
stop("old and new need to be of equal length.")
}
for(ind in seq_along(old)) {
x <- replace(x, x == old[ind], new[ind])
}
return(x)
}


changeClassOfNonNA <- function(name, classes, data) {
if(name %in% names(data) && name %in% names(classes) && firstClass(data[[name]]) != classes[[name]]) {
thisClass <- classes[[name]]
if(all(is.na(data[[name]]))) {
NAToInsert <- getNAByType(thisClass)
data[, c(name) := ..NAToInsert]
}
else {
# Removed this by a special function that converts to integer after rounding to avoid problems like trunc(0.29 * 100) == 28:
# I.e., convertion from float to integer performs in the same way as trunc(), which has problems with floating numbers. For the fish lengths that are relevant we have the problem for the following values:
# int <- seq_len(150)
# d <- data.table::data.table(int = int, equalToFloat = trunc(int / 100 * 100) == int)
# subset(d, !equalToFloat)
#data[, c(name) := get(paste("as", thisClass, sep = "."))(get(name))]
# int equalToFloat
# <int> <lgcl>
# 1: 29 FALSE
# 2: 57 FALSE
# 3: 58 FALSE
# 4: 113 FALSE
# 5: 114 FALSE
# 6: 115 FALSE
# 7: 116 FALSE
# So the problem is particularly for fish of length 29 cm, which were truncated to 28 cm when submitting to ICES:
data[, c(name) := get(getConversionFunction(thisClass))(get(name))]
}
}
}


getConversionFunction <- function(class) {
atInteger <- class %in% "integer"
out <- paste("as", class, sep = ".")
out[atInteger] <- "asIntegerAfterRound"
return(out)
}


asIntegerAfterRound <- function(x, prec = .Machine$double.eps) {
# This operation requires that the input can be represented as numeric, so we test that first by observing whether the number of missing values increases:
x_numeric <- as.numeric(x)
numberOfNAs <- sum(is.na(x))
numberOfNAs_numeric <- sum(is.na(x_numeric))
if(numberOfNAs_numeric > numberOfNAs) {
warning("StoX: NAs introduced when trying to convert to integer.")
return(as.integer)
}

# Convert to integer:
x_integer <- as.integer(x)
# Find values which differ to the integer value by less than the input precision, and round these off before converting to integer to avoid occasional shifts in integer value due to floating point representation (e.g. as.integer(0.29 * 100) == 28):
atSmallDiff <- which(abs(x_numeric - x_integer) <= prec)

# Convert to integer, but for values that differ to the integer value by more than
x_integer[atSmallDiff] <- as.integer(round(x_numeric[atSmallDiff]))

return(x_integer)
}

#' Write ICESBiotic to CSV fille
#'
Expand Down
124 changes: 123 additions & 1 deletion R/Utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ mapplyOnCores <- function(FUN, NumberOfCores = 1L, ..., MoreArgs = NULL, SIMPLIF

# On Windows run special args to speed up:
#if(get_os() == "win") {
cl <- parallel::makeCluster(NumberOfCores, rscript_args = c("--no-init-file", "--no-site-file", "--no-environ"))

# Removed the rscript_args, because it changes the envoronment compared to the partent environment:
#cl <- parallel::makeCluster(NumberOfCores, rscript_args = c("--no-init-file", "--no-site-file", "--no-environ"))
cl <- parallel::makeCluster(NumberOfCores)
out <- parallel::clusterMap(cl, FUN, ..., MoreArgs = MoreArgs, SIMPLIFY = SIMPLIFY)
parallel::stopCluster(cl)
#}
Expand Down Expand Up @@ -1052,3 +1055,122 @@ do.call_robust <- function(what, args, quote = FALSE, envir = parent.frame(), ke
}



























# Process column names and types
setNames_OneTable <- function(tableName, data, xsd) {

# For convenience extract the tableHeader of the current table:
tableHeader <- xsd$tableHeaders[[tableName]]

# There are duplicated column names in NMDBiotic 1, 1.1 and 1.4. we suffix the table name to those fields:
if(anyDuplicated(tableHeader)) {
dup <- duplicated(tableHeader)
tableHeader[dup] <- paste(tableHeader[dup], tableName, sep = ".")
}

# Handle empty data. This is only relevant for NMDBiotic and NMDAcoustic, which both have levels with no data (i.e. "missions" and "distance_list", respectively):
if(!length(data[[tableName]])) {
data[[tableName]] <- matrix(data = "", nrow = 0, ncol = length(tableHeader))
}

# Convert to data.table
output <- data.table(data[[tableName]])

# Set column names
setnames(output, tableHeader)

return(output)
}





# Set types of the columns of the table named 'tableName' of 'data'. Note that this only considers the columns with names present in the xsd$tableHeader. For ICES formats the keys are not included in the tableHeader, but all keys are character
setClass_OneTable <- function(tableName, data, xsd) {

# Known atomic data types
conversionFunctionName <- getRstoxDataDefinitions("conversionFunctionName")

# Set column types (only double and integer for now)
tableHeader <- xsd$tableHeader[[tableName]]
tableType <- xsd$tableTypes[[tableName]]
if(length(tableType) > 0) {
for(i in seq_along(tableHeader)) {
# Map the types
doConv <- eval(
parse(
text = conversionFunctionName[[tableType[i]]]
)
)

# Throw a proper warning when conversion fails:
tryCatch(
data[[tableName]][, tableHeader[i] := doConv(data [[tableName]] [[tableHeader[i]]] ) ],
error = function(e) {
e
},
warning = function(w) {
modifiedWarning <- paste0("The following variable could not converted to numeric as per the format definition and were set to NA: ", names(data[[tableName]])[i])
warning(modifiedWarning)
}
)
}
}

invisible(tableName)
}


asIntegerAfterRound <- function(x, prec = sqrt(.Machine$double.eps)) {
# This operation requires that the input can be represented as numeric, so we test that first by observing whether the number of missing values increases:
x_numeric <- as.numeric(x)
x_integer <- as.integer(x)

# Detect whether the input is not fully convertible to integer, which we assume is the case if there are mote missing values in the x_numeric, in which case we simply return the x_integer:
numberOfNAs <- sum(is.na(x))
numberOfNAs_numeric <- sum(is.na(x_numeric))
if(numberOfNAs_numeric > numberOfNAs) {
warning("StoX: NAs introduced when trying to convert to integer.")
return(x_integer)
}

# Convert to integer:
x_integer <- as.integer(x)
x_rounded <- round(x_numeric)
# Find values which differ to the integer value by less than the input precision, and round these off before converting to integer to avoid occasional shifts in integer value due to floating point representation (e.g. as.integer(0.29 * 100) == 28):
diff <- x_numeric - x_rounded
atSmallDiff <- which(diff < 0 & -diff <= prec)

# Convert to integer after rounding for values that differ to the integer value by less than the prec:
x_integer[atSmallDiff] <- as.integer(x_rounded[atSmallDiff])

return(x_integer)
}



Loading

0 comments on commit 175de88

Please sign in to comment.