Skip to content

Commit

Permalink
[DO NOT MERGE] Detect file type
Browse files Browse the repository at this point in the history
Rustcore should have functionality to detect the file type: whether
the file is binary or text and, if it is text, what the encoding of
the file is.

Resolves: #1841
  • Loading branch information
sudeeptarlekar committed Oct 23, 2023
1 parent ac4c8fd commit ac37937
Showing 1 changed file with 74 additions and 0 deletions.
74 changes: 74 additions & 0 deletions application/apps/indexer/indexer_cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1799,3 +1799,77 @@ async fn detect_messages_type(input: &Path) -> Result<bool, DltParseError> {
)))
}
}

/// Reads the first chunk of the file at `file_path` and reports whether the
/// file is likely binary or text.
///
/// The result is a human-readable `String`: a verdict line followed by the
/// sampled content. Later we can add an enum for the file type.
///
/// The sample is decoded lossily: byte sequences that are not valid UTF-8 are
/// replaced with U+FFFD. A strict decode would fail on most binary files, and
/// also on valid text whose multi-byte character is split by the sample
/// boundary.
///
/// # Panics
///
/// Panics if the file cannot be opened or read.
// NOTE(review): the helper used here reads the file synchronously, so this
// async fn blocks its executor thread for the duration of the read.
async fn detect_file_type(file_path: &Path) -> String {
    let buffer = match fetch_starting_chunk(file_path) {
        Ok(buffer) => buffer,
        // TODO: add correct error handling
        Err(error) => panic!("Error while reading chunks from file: {}", error),
    };
    let is_binary: bool = is_file_binary(&buffer);

    println!("Still in progress... and is likely binary {}", is_binary);

    // Lossy decoding never fails, so arbitrary binary input cannot abort here.
    let content = String::from_utf8_lossy(&buffer);

    println!("Still in progress.... and content of the file {}", content);

    let mut report = if is_binary {
        String::from("The file is likely binary with content\n")
    } else {
        String::from("The file is not likely binary with content\n")
    };
    report.push_str(&content);
    report
}

/// Opens the file at `file_path` and returns at most its first 1024 bytes.
///
/// A file shorter than 1024 bytes is returned in full. Any error from opening
/// or reading the file is propagated to the caller.
fn fetch_starting_chunk(file_path: &Path) -> Result<Vec<u8>> {
    let mut head = Vec::new();
    // `take` caps the reader, so `read_to_end` stops after 1024 bytes.
    File::open(file_path)?.take(1024).read_to_end(&mut head)?;
    Ok(head)
}

/// Returns `true` when the byte is in the range 32..=126 (space through `~`)
/// or is a carriage return, line feed, or tab.
fn is_printable(char_encoding: &u8) -> bool {
    let byte = *char_encoding;
    (32..=126).contains(&byte) || byte == b'\r' || byte == b'\n' || byte == b'\t'
}

/// Returns `true` for bytes in the range 127..=255, i.e. DEL and every byte
/// with the top bit set.
fn is_high_ascii(char_encoding: &u8) -> bool {
    *char_encoding >= 127
}

/// Heuristically decides whether `buffer` (a sample of a file) looks binary.
///
/// Two ratios are computed over the sample:
/// * the share of bytes that are not printable (per `is_printable`), and
/// * the share of bytes that are high ASCII (per `is_high_ascii`).
///
/// The sample is considered binary when either
/// * more than 30% of bytes are non-printable and fewer than 5% are high
///   ASCII, or
/// * more than 80% of bytes are non-printable and more than 80% are high
///   ASCII.
///
/// An empty buffer is reported as not binary.
///
/// The parameter is a slice; callers holding a `Vec<u8>` can keep passing
/// `&vec` thanks to deref coercion.
fn is_file_binary(buffer: &[u8]) -> bool {
    if buffer.is_empty() {
        // Also guards the divisions below against a zero length.
        return false;
    }

    let total = buffer.len() as f32;

    let non_printable_ratio =
        buffer.iter().filter(|byte| !is_printable(byte)).count() as f32 / total;

    // Count the bytes that ARE high ASCII; negating this predicate would
    // measure the complement and invert both threshold checks below.
    let high_ascii_ratio =
        buffer.iter().filter(|byte| is_high_ascii(byte)).count() as f32 / total;

    (non_printable_ratio > 0.3 && high_ascii_ratio < 0.05)
        || (non_printable_ratio > 0.8 && high_ascii_ratio > 0.8)
}

0 comments on commit ac37937

Please sign in to comment.