
Commit

feat: add subcli replace and transpose
sharkLoc committed Apr 26, 2024
1 parent 8343da6 commit bcfc1e5
Showing 8 changed files with 187 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "xtab"
version = "0.0.4"
version = "0.0.5"
authors = ["sharkLoc <[email protected]>"]
edition = "2021"
homepage = "https://github.com/sharkLoc/xtab"
17 changes: 8 additions & 9 deletions README.md
@@ -23,7 +23,7 @@ cargo b --release

```bash
xtab -- CSV command line utilities
Version: 0.0.4
Version: 0.0.5

Authors: sharkLoc <[email protected]>
Source code: https://github.com/sharkLoc/xtab.git
@@ -46,30 +46,29 @@ Commands:
flatten flattened view of CSV records [aliases: flat]
freq Build frequency table of selected column in CSV data
head Print first N records from CSV file
pretty Convert CSV to a readable aligned table
pretty Convert CSV to a readable aligned table [aliases: prt]
replace Replace data of matched fields
reverse Reverses rows of CSV data [aliases: rev]
sample Randomly select rows from CSV file using reservoir sampling
search Applies the regex to each field individually and shows only matching rows
slice Slice rows from a part of a CSV file
tail Print last N records from CSV file
transpose Transpose CSV data [aliases: trans]
uniq Unique data with keys
xlsx2csv Convert XLSX to CSV format [aliases: x2c]
view Show CSV file content
help Print this message or the help of the given subcommand(s)


Options:
-h, --help Print help (see more with '--help')
-V, --version Print version
-h, --help Print help (see more with '--help')

Global Arguments:
-o, --out <FILE> Output file name, file ending in .gz/.bz2/.xz will be compressed automatically, if file not specified write data to stdout
-d, --delimiter <CHAR> Set delimiter for input csv file, e.g., -d $'\t' for tab [default: ,]
-D, --out-delimite <CHAR> Set delimiter for output CSV file, e.g., -D $'\t' for tab [default: ,]
-d, --delimiter <CHAR> Set delimiter for input csv file, e.g., in linux -d $'\t' for tab, in powershell -d `t for tab [default: ,]
-D, --out-delimite <CHAR> Set delimiter for output CSV file, e.g., in linux -D $'\t' for tab, in powershell -D `t for tab [default: ,]
--log <FILE> If file name specified, write log message to this file, or write to stderr
--compress-level <INT> Set compression level 1 (compress faster) - 9 (compress better) for gzip/bzip2/xz output file, just work with option -o/--out [default: 6]
-v, --verbosity <STR> Control verbosity of logging, possible values: {error, warn, info, debug, trace} [default: debug]
[CSV] Input csv file name, if file not specified read data from stdin
[CSV] Input CSV file name, if file not specified read data from stdin
Global FLAGS:
-H, --no-header If set, the first row is treated as a special header row, and the original header row excluded from output
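For orientation, the two new subcommands added in this commit might be driven like this (illustrative only: `input.csv`, `out.csv`, and `transposed.csv` are placeholder file names, data is read from stdin when no file is given, and the option names are taken from the `src/args.rs` hunks below):

```bash
# replace: swap cells in column 2 that exactly equal "NA" with "0"
cat input.csv | xtab replace --col-index 2 --src NA --dst 0 --out out.csv

# transpose: turn rows into columns (alias: trans)
cat input.csv | xtab transpose --out transposed.csv
```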
30 changes: 29 additions & 1 deletion src/args.rs
@@ -6,7 +6,7 @@ use clap::{value_parser, Parser};
#[command(
name = "xtab",
author = "sharkLoc",
version = "0.0.4",
version = "0.0.5",
next_line_help = false,
about = "CSV command line utilities",
long_about = "A simple and cross-platform program for CSV file manipulation"
@@ -152,6 +152,7 @@ pub enum Cmd {
},

/// Convert CSV to a readable aligned table
#[command(visible_alias = "prt")]
pretty {
/// Set the whole table width
#[arg(short = 'w', long = "width-table", value_name = "INT", value_parser = value_parser!(u16).range(0..=65535))]
@@ -167,6 +168,25 @@
header: bool,
},

/// Replace data of matched fields
replace {
/// Select columns index, e.g -c 2,3,5
#[arg(short = 'c', long = "col-index", value_name = "STR", default_value_t = String::from("1"))]
col_index: String,
/// Raw cell content
#[arg(short = 's', long = "src", value_name = "STR")]
src: String,
/// New cell content
#[arg(short = 'd', long = "dst", value_name = "STR")]
dst: String,
/// If set, replace data in whole CSV file, overwrite option -c
#[arg(short = 'a', long = "all", help_heading = Some("FLAGS"))]
all: bool,
/// Output file name, file ending in .gz/.bz2/.xz will be compressed automatically, if file not specified write data to stdout
#[arg(short = 'o', long = "out", value_name = "FILE")]
output: Option<PathBuf>,
},

/// Reverses rows of CSV data
#[command(visible_alias = "rev")]
reverse {
@@ -236,6 +256,14 @@
output: Option<PathBuf>,
},

/// Transpose CSV data
#[command(visible_alias = "trans")]
transpose {
/// Output file name, file ending in .gz/.bz2/.xz will be compressed automatically, if file not specified write data to stdout
#[arg(short = 'o', long = "out", value_name = "FILE")]
output: Option<PathBuf>,
},

/// Unique data with keys
uniq {
/// Select these fields as keys. e.g -k 2,3,5
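Besides the new `replace` and `transpose` variants, this hunk also gives `pretty` a visible alias. Assuming some CSV data on stdin (`data.csv` is a placeholder), the aliases declared above can be used in place of the full names:

```bash
cat data.csv | xtab prt      # same as: xtab pretty
cat data.csv | xtab trans    # same as: xtab transpose
```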
2 changes: 2 additions & 0 deletions src/command/mod.rs
@@ -6,6 +6,7 @@ pub mod flatten;
pub mod freq;
pub mod head;
pub mod pretty;
pub mod replace;
pub mod reverse;
pub mod sample;
pub mod search;
@@ -14,6 +15,7 @@ pub mod slice;
//pub mod split;
//pub mod stats;
pub mod tail;
pub mod transpose;
pub mod uniq;
pub mod view;
pub mod xlsx2csv;
85 changes: 85 additions & 0 deletions src/command/replace.rs
@@ -0,0 +1,85 @@
use crate::utils::*;
use anyhow::{Error, Ok};
use csv::{ReaderBuilder, StringRecord, WriterBuilder};
use log::*;
use std::{path::PathBuf, time::Instant};


pub fn replace_csv(
    no_header: bool,
    delimiter: u8,
    out_delimiter: u8,
    index_str: &str,
    src: &str,
    dst: &str,
    all: bool,
    csv: Option<PathBuf>,
    csvo: Option<PathBuf>,
    compression_level: u32,
) -> Result<(), Error> {
    let start = Instant::now();

    let mut csv_reader = ReaderBuilder::new()
        .has_headers(no_header)
        .flexible(true)
        .delimiter(delimiter)
        .from_reader(file_reader(csv.as_ref())?);

    let mut col_index = vec![];
    for idx in index_str.split(',').collect::<Vec<&str>>() {
        let idx = idx.parse::<usize>()?;
        if col_index.contains(&idx) {
            warn!("duplicate columns index {}, keep first one", idx);
            continue;
        } else {
            col_index.push(idx);
        }
        if idx == 0 {
            error!("col_index error : {}, start from 1", idx);
            std::process::exit(1);
        }
    }

    match csv {
        Some(csv) => info!("read file from: {:?}", csv),
        None => info!("read file from stdin "),
    }

    let mut csv_writer = WriterBuilder::new()
        .has_headers(no_header)
        .delimiter(out_delimiter)
        .from_writer(file_writer(csvo.as_ref(), compression_level)?);

    let mut rec_new = StringRecord::new();
    let mut count = 0usize;
    for rec in csv_reader.records().flatten() {
        for (idx, each) in rec.iter().enumerate() {
            if all {
                if each == src {
                    rec_new.push_field(dst);
                    count += 1;
                } else {
                    rec_new.push_field(each);
                }
            } else {
                if col_index.contains(&(idx + 1)) {
                    if each == src {
                        rec_new.push_field(dst);
                        count += 1;
                    } else {
                        rec_new.push_field(each);
                    }
                } else {
                    rec_new.push_field(each);
                }
            }
        }
        csv_writer.write_record(&rec_new)?;
        rec_new.clear();
    }
    csv_writer.flush()?;

    info!("total replace cell count: {}", count);
    info!("time elapsed is: {:?}", start.elapsed());
    Ok(())
}
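Note that the replacement above is an exact whole-cell comparison (`each == src`), not a substring or regex match, and column indices are 1-based. A rough illustration with made-up data, assuming the default comma delimiters and stdin input (log messages go to stderr):

```bash
$ printf '1,NA\n2,NAME\n' | xtab replace --col-index 2 --src NA --dst 0
1,0
2,NAME
```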
54 changes: 54 additions & 0 deletions src/command/transpose.rs
@@ -0,0 +1,54 @@
use crate::utils::*;
use anyhow::{Error, Ok};
use csv::{ReaderBuilder, StringRecord, WriterBuilder};
use log::*;
use std::{collections::HashMap, path::PathBuf, time::Instant};


pub fn transpose_csv(
    no_header: bool,
    delimiter: u8,
    out_delimiter: u8,
    csv: Option<PathBuf>,
    csvo: Option<PathBuf>,
    compression_level: u32,
) -> Result<(), Error> {
    let start = Instant::now();

    let mut csv_reader = ReaderBuilder::new()
        .has_headers(no_header)
        .flexible(true)
        .delimiter(delimiter)
        .from_reader(file_reader(csv.as_ref())?);

    match csv {
        Some(csv) => info!("read file from: {:?}", csv),
        None => info!("read file from stdin "),
    }

    let mut df_hash: HashMap<usize, Vec<String>> = HashMap::new();
    for rec in csv_reader.records().flatten() {
        for (col, each) in rec.iter().enumerate() {
            df_hash.entry(col).or_default().push(each.to_string());
        }
    }

    let mut csv_writer = WriterBuilder::new()
        .has_headers(no_header)
        .delimiter(out_delimiter)
        .from_writer(file_writer(csvo.as_ref(), compression_level)?);

    let mut str_rec = StringRecord::new();
    for i in 0..df_hash.len() {
        let vec = df_hash.get(&i).unwrap();
        for v in vec.iter() {
            str_rec.push_field(v);
        }
        csv_writer.write_record(&str_rec)?;
        str_rec.clear();
    }
    csv_writer.flush()?;

    info!("time elapsed is: {:?}", start.elapsed());
    Ok(())
}
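The transpose works by bucketing every value into `df_hash` keyed by its column index and then emitting each bucket as one output row, so the whole input is buffered in memory before anything is written (with `flexible(true)`, ragged input rows simply yield shorter transposed rows). A rough illustration with made-up data, assuming default delimiters and stdin input:

```bash
$ printf 'a,b,c\n1,2,3\n' | xtab transpose
a,1
b,2
c,3
```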
8 changes: 7 additions & 1 deletion src/main.rs
@@ -7,7 +7,7 @@ mod command;
mod utils;

use command::{
addheader::addheader_csv, csv2xlsx::csv_xlsx, dim::dim_csv, drop::drop_csv, flatten::flatten_csv, freq::freq_csv, head::head_csv, pretty::pretty_csv, reverse::reverse_csv, sample::sample_csv, search::search_csv, slice::slice_csv, tail::tail_csv, uniq::uniq_csv, view::view_csv, xlsx2csv::xlsx_csv
addheader::addheader_csv, csv2xlsx::csv_xlsx, dim::dim_csv, drop::drop_csv, flatten::flatten_csv, freq::freq_csv, head::head_csv, pretty::pretty_csv, replace::replace_csv, reverse::reverse_csv, sample::sample_csv, search::search_csv, slice::slice_csv, tail::tail_csv, transpose::transpose_csv, uniq::uniq_csv, view::view_csv, xlsx2csv::xlsx_csv
};


@@ -66,6 +66,12 @@ fn main() -> Result<(), Error>{
args::Cmd::search { pat, case, invert, output } => {
search_csv(cmd.no_header, cmd.delimiter as u8, cmd.out_delimite as u8, case, invert, &pat, cmd.input, output, cmd.compression_level)?;
}
args::Cmd::transpose { output } => {
transpose_csv(cmd.no_header, cmd.delimiter as u8, cmd.out_delimite as u8, cmd.input, output, cmd.compression_level)?;
}
args::Cmd::replace { col_index, src, dst, all, output } => {
replace_csv(cmd.no_header, cmd.delimiter as u8, cmd.out_delimite as u8, &col_index, &src, &dst, all, cmd.input, output, cmd.compression_level)?;
}
}

Ok(())
