forked from hackbio-ca/disease-susceptibility-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_in_file.py
66 lines (52 loc) · 2 KB
/
read_in_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 28 15:48:26 2024
@author: Owner
"""
import os
import re

import pandas as pd
# Switch the working directory to the local checkout of the repository; the
# relative file names used by the rest of the script resolve against it.
os.chdir(r'C:\Users\Owner\Documents\GitHub\disease-susceptibility-analysis')

# Input CSV to process. The rs identifier embedded in this name is reused
# further down as the name of the output file.
original_file = (
    '1000genomesprojectphase3-PopulationGenotypes-'
    'Homo_sapiens_Variation_Population_rs10841302.csv'
)
# Function to extract rs_number from the filename
def extract_rs_number(filename):
    """Return the ``rs<digits>`` identifier embedded in *filename*.

    The identifier is the literal ``rs`` followed by one or more digits, and
    it must not be directly preceded by a letter. That restriction keeps
    ordinary words that merely contain "rs" (``first``, ``users``, ``hours12``)
    from being mistaken for an identifier. Only the ``rs`` prefix and its
    digits are returned, so any suffix or file extension is dropped and a
    name without an extension still works.

    Args:
        filename: File name (or path) to search.

    Returns:
        The first matching identifier, e.g. ``'rs10841302'``, or ``None`` if
        the name contains none.
    """
    # The lookbehind rejects an 'rs' that is merely the tail of a longer word.
    match = re.search(r'(?<![A-Za-z])rs\d+', filename)
    if match is None:
        return None
    return match.group(0)
# Pull the rs identifier out of the input file name (None if there is none).
rs_number = extract_rs_number(original_file)

# Read the CSV with header=None: every row is treated as data and the columns
# are labelled by position. If the file actually starts with a header row,
# that row is read as data too and header=0 would be needed instead.
# Only the first and third columns (positions 0 and 2) are kept.
df = pd.read_csv(original_file, header=None)[[0, 2]]

# Give the two retained columns working names.
df.columns = ['Column1', 'Column2']
# Function to extract country code from the first column
def extract_country_code(col):
    """Return the part of *col* that precedes its first ``';'``.

    Args:
        col: Cell value from the first column. A string without any ``';'``
            is returned unchanged.

    Returns:
        The leading segment as a string, or ``None`` when *col* is not a
        string. pandas represents empty CSV cells as float NaN, which has no
        string methods; returning ``None`` lets one missing cell become a
        missing value instead of aborting the whole ``apply``.
    """
    # NOTE(review): the name suggests this segment is a country/population
    # code; the input schema is not visible here -- confirm against the data.
    if not isinstance(col, str):
        return None
    return col.partition(';')[0]
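# Function to extract the number that follows 'C:' in the second column
# (the name says "T number", but the code splits on 'C:' -- confirm which is intended)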
def extract_t_number(col, allele='C'):
    """Return the first whitespace-delimited token after ``'<allele>:'``.

    NOTE(review): despite the function name, the marker searched for has
    always been ``'C:'``, not ``'T:'``. That remains the default so existing
    output is unchanged; pass ``allele='T'`` to read the value after ``'T:'``.
    Confirm which one the analysis actually needs.

    Args:
        col: Cell value from the second column.
        allele: Label that, followed by a colon, marks the value to extract.

    Returns:
        The token as a string (it is not converted to a number), or ``None``
        when *col* is not a string (pandas yields float NaN for empty cells),
        when the marker is absent, or when nothing but whitespace follows it.
    """
    if not isinstance(col, str):
        return None

    parts = col.split(f'{allele}:')
    if len(parts) < 2:
        # Marker not present in this cell.
        return None

    # parts[1] is the text between the first marker and the next one (or the
    # end of the string); it may be empty, so guard before taking a token.
    tokens = parts[1].split()
    return tokens[0] if tokens else None
# Parse every cell of the two working columns into the derived fields.
df = df.assign(
    Country_Code=df['Column1'].apply(extract_country_code),
    T_Number=df['Column2'].apply(extract_t_number),
)

# Only the derived fields go into the output.
output_df = df.loc[:, ['Country_Code', 'T_Number']]

# The output file is named after the rs identifier, so nothing is written
# when the input file name did not contain one.
if not rs_number:
    print("rs_number not found in the file name")
else:
    output_filename = f'{rs_number}.csv'
    output_df.to_csv(output_filename, index=False)
    print(f"Saved to {output_filename}")

# Show the first few rows as a quick visual check.
print(output_df.head())