Skip to content

Commit

Permalink
[DO NOT MERGE] Detect file type
Browse files Browse the repository at this point in the history
Rustcore should be able to detect the type of a file: whether the
file is binary or text and, if it is text, what the encoding of the
file is.

Resolves: #1841
  • Loading branch information
sudeeptarlekar committed Oct 19, 2023
1 parent ac4c8fd commit 9899a74
Showing 1 changed file with 57 additions and 0 deletions.
57 changes: 57 additions & 0 deletions application/apps/indexer/indexer_cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1799,3 +1799,60 @@ async fn detect_messages_type(input: &Path) -> Result<bool, DltParseError> {
)))
}
}

/// Detects whether the file at `file_path` looks like binary or text data.
///
/// Reads the leading chunk of the file with `fetch_starting_chunk` and
/// classifies those bytes with `is_file_binary`.
///
/// Returns the string `"binary"` or `"text"`. A dedicated enum for the file
/// type can replace the string later.
///
/// # Panics
///
/// Panics if the leading chunk cannot be read from the file.
async fn detect_file_type(file_path: &Path) -> String {
    // TODO: add correct error handling
    let buffer = match fetch_starting_chunk(file_path) {
        Ok(buffer) => buffer,
        Err(error) => panic!("Error while reading chunks from file: {}", error),
    };

    // Report the classification instead of discarding it.
    if is_file_binary(buffer) {
        String::from("binary")
    } else {
        String::from("text")
    }
}

/// Reads at most the first 1024 bytes of the file at `file_path`.
///
/// Returns the bytes that were read; the vector is shorter than 1024 bytes
/// when the file itself is shorter.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or read.
fn fetch_starting_chunk(file_path: &Path) -> Result<Vec<u8>> {
    let mut head = Vec::new();
    // `take` caps the reader, so `read_to_end` stops after 1024 bytes.
    File::open(file_path)?.take(1024).read_to_end(&mut head)?;
    Ok(head)
}

/// Returns `true` when the byte is in the range 32..=126 (space through `~`)
/// or is a carriage return, line feed, or tab.
fn is_printable(char_encoding: &u8) -> bool {
    matches!(*char_encoding, b' '..=b'~' | b'\r' | b'\n' | b'\t')
}

/// Returns `true` when the byte value lies in the range 127..=255.
fn is_high_ascii(char_encoding: &u8) -> bool {
    matches!(*char_encoding, 127..=255)
}

/// Heuristically decides whether `buffer` holds binary data.
///
/// Two ratios are computed over the whole buffer:
/// - the share of bytes for which `is_printable` is false, and
/// - the share of bytes for which `is_high_ascii` is false.
///
/// The buffer is reported as binary when the first ratio exceeds 0.3 while
/// the second is below 0.05, or when both ratios exceed 0.8. An empty buffer
/// is never reported as binary.
fn is_file_binary(buffer: Vec<u8>) -> bool {
    if buffer.is_empty() {
        return false;
    }

    // Tally both categories in a single pass over the bytes.
    let mut unprintable = 0usize;
    let mut outside_high_range = 0usize;
    for byte in &buffer {
        if !is_printable(byte) {
            unprintable += 1;
        }
        if !is_high_ascii(byte) {
            outside_high_range += 1;
        }
    }

    let total = buffer.len() as f32;
    let unprintable_ratio = unprintable as f32 / total;
    let outside_high_ratio = outside_high_range as f32 / total;

    let mostly_high_bytes = unprintable_ratio > 0.3 && outside_high_ratio < 0.05;
    let mostly_low_unprintable = unprintable_ratio > 0.8 && outside_high_ratio > 0.8;
    mostly_high_bytes || mostly_low_unprintable
}

0 comments on commit 9899a74

Please sign in to comment.