// Extracted from patch 9899a747d29bdeb717f796624bcbd6ed2c7c02fa
// ("[DO NOT MERGE] Detect file type", sudeeptarlekar, Wed, 18 Oct 2023; resolves #1841).
// Rustcore should be able to detect whether a file is binary or text and, for text,
// which encoding it uses. The functions below are appended to
// application/apps/indexer/indexer_cli/src/main.rs, directly after `detect_messages_type`.

/// Classifies the file at `file_path` as `"binary"` or `"text"`.
///
/// Only the leading chunk of the file (as returned by `fetch_starting_chunk`) is
/// inspected, so the result is a heuristic guess. The string result is a stop-gap;
/// an enum for the file type can replace it later.
///
/// # Panics
///
/// Panics if the file cannot be opened or read.
async fn detect_file_type(file_path: &Path) -> String {
    // TODO: add correct error handling; the `String` return type leaves no room for
    // reporting a failure, so an I/O error still aborts here.
    // NOTE(review): this looks like a blocking read inside an `async fn` — confirm
    // that is acceptable for the executor this runs on.
    let buffer = fetch_starting_chunk(file_path)
        .unwrap_or_else(|error| panic!("Error while reading chunks from file: {}", error));

    let file_type = if is_file_binary(buffer) {
        "binary"
    } else {
        "text"
    };
    String::from(file_type)
}

/// Reads at most the first 1024 bytes of the file at `file_path`.
///
/// Files shorter than that are returned in full; an empty file yields an empty vector.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or read.
fn fetch_starting_chunk(file_path: &Path) -> std::io::Result<Vec<u8>> {
    // Upper bound on how many leading bytes are sampled for type detection.
    const CHUNK_SIZE: usize = 1024;

    let mut buffer = Vec::with_capacity(CHUNK_SIZE);
    // `usize` -> `u64` is lossless for this small constant.
    File::open(file_path)?
        .take(CHUNK_SIZE as u64)
        .read_to_end(&mut buffer)?;
    Ok(buffer)
}

/// Returns `true` if the byte is printable 7-bit ASCII (space through `~`) or one of
/// the whitespace control characters `\r`, `\n`, `\t`.
fn is_printable(char_encoding: &u8) -> bool {
    matches!(*char_encoding, 32..=126 | b'\r' | b'\n' | b'\t')
}

/// Returns `true` if the byte lies outside 7-bit ASCII, i.e. in `128..=255`.
///
/// DEL (127) is a 7-bit control character and is therefore not reported as high ASCII.
fn is_high_ascii(char_encoding: &u8) -> bool {
    !char_encoding.is_ascii()
}

/// Guesses whether `buffer` holds binary (as opposed to textual) content.
///
/// Two ratios over the whole buffer drive the decision:
/// * the share of non-printable bytes (everything `is_printable` rejects, which
///   includes all high-ASCII bytes), and
/// * the share of high-ASCII bytes.
///
/// The buffer is reported as binary when either
/// * more than 30% of the bytes are non-printable while fewer than 5% are high ASCII
///   (dominated by control characters), or
/// * more than 80% are non-printable and more than 80% are high ASCII.
///
/// An empty buffer is reported as not binary.
fn is_file_binary(buffer: Vec<u8>) -> bool {
    if buffer.is_empty() {
        return false;
    }

    let total = buffer.len() as f32;

    let non_printable_count = buffer.iter().filter(|byte| !is_printable(byte)).count();
    // Count the bytes that ARE high ASCII; the ratio below is compared against
    // "few high bytes" / "mostly high bytes" thresholds.
    let high_ascii_count = buffer.iter().filter(|byte| is_high_ascii(byte)).count();

    let non_printable_ratio = non_printable_count as f32 / total;
    let high_ascii_ratio = high_ascii_count as f32 / total;

    (non_printable_ratio > 0.3 && high_ascii_ratio < 0.05)
        || (non_printable_ratio > 0.8 && high_ascii_ratio > 0.8)
}