Skip to content

Commit

Permalink
update gitignore
Browse files Browse the repository at this point in the history
  • Loading branch information
sedv8808 committed Aug 4, 2020
1 parent a1df1f3 commit 2498824
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ src/modules/preprocessing/__pycache__/neotoma_loader.cpython-37.pyc
src/modules/preprocessing/__pycache__/nlp_sentence_loader.cpython-37.pyc
src/modules/preprocessing/__pycache__/config.cpython-37.pyc
src/modules/preprocessing/__pycache__/preprocess_all_data.cpython-37.pyc
src/modules/preprocessing/__pycache__/config.cpython-37.pyc
*cpython-37.pyc
src/output/.ipynb_checkpoints/
.ipynb_checkpoints/
src/modules/preprocessing/__pycache__/config.cpython-37.pyc
Expand All @@ -74,5 +76,3 @@ src/output/bibliography.csv
src/output/nlp_sentences_df.csv
src/output/sentences_with_latlong_intersections.tsv
src/output/sentences_with_sitenames_intersections.tsv

Modelling_Coords_Vishal.ipynb
16 changes: 16 additions & 0 deletions data/neotoma_dummy.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
siteid,sitename,longitudeeast,latitudenorth,longitudewest,latitudesouth,altitude,area,sitedescription,notes,recdatecreated,recdatemodified,geog,datasetid,collectionunitid,datasettypeid,datasetname,notes-2,recdatecreated-2,recdatemodified-2,embargoid,citation,doi
10330,Lac du Sommet,-70.66468,47.71662,-70.66573,47.71382,830,2,"The small shallow Lac du Sommet (0.02 km2, 4 m maximum depth, elevation of 830 m a.s.l., 47°43′N, 70°40′W) is located in the boreal forest north of the St Lawrence Estuary (Figure 1). It is of glacial origin and situated on granitic-gneissic bedrock of the Canadian Precambrian Shield in the Laurentian Mountains.",,2015-04-16 19:02:09,2016-05-18 21:10:02,0103000020E61000000100000005000000D47D00529BAA51C0889D29745EDB47408A93FB1D8AAA51C0889D29745EDB47408A93FB1D8AAA51C06BD44334BADB4740D47D00529BAA51C06BD44334BADB4740D47D00529BAA51C0889D29745EDB4740,15691,11686,1,,,2015-04-16 19:02:13,2015-04-16 19:02:13,,"Hausmann, S., I. Larocque-Tobler, P.J.H. Richard, R. Pienitz, G. St-Onge, and F. Fye. 2011. Diatom-inferred wind activity at Lac du Sommet, southern Québec, Canada: A multiproxy paleoclimate reconstruction based on diatoms, chironomids, and pollen for the past 9500 years. The Holocene 21(6):925-938. http://hol.sagepub.com/content/21/6/925.full.pdf+html. [DOI: 10.1177/0959683611400199]",10.1177/0959683611400199
10330,Lac du Sommet,-70.66468,47.71662,-70.66573,47.71382,830,2,"The small shallow Lac du Sommet (0.02 km2, 4 m maximum depth, elevation of 830 m a.s.l., 47°43′N, 70°40′W) is located in the boreal forest north of the St Lawrence Estuary (Figure 1). It is of glacial origin and situated on granitic-gneissic bedrock of the Canadian Precambrian Shield in the Laurentian Mountains.",,2015-04-16 19:02:09,2016-05-18 21:10:02,0103000020E61000000100000005000000D47D00529BAA51C0889D29745EDB47408A93FB1D8AAA51C0889D29745EDB47408A93FB1D8AAA51C06BD44334BADB4740D47D00529BAA51C06BD44334BADB4740D47D00529BAA51C0889D29745EDB4740,15692,11686,11,Lac du Sommet Diatom Data,,2015-04-16 19:02:20,2015-04-16 19:02:20,,"Hausmann, S., I. Larocque-Tobler, P.J.H. Richard, R. Pienitz, G. St-Onge, and F. Fye. 2011. Diatom-inferred wind activity at Lac du Sommet, southern Québec, Canada: A multiproxy paleoclimate reconstruction based on diatoms, chironomids, and pollen for the past 9500 years. The Holocene 21(6):925-938. http://hol.sagepub.com/content/21/6/925.full.pdf+html. [DOI: 10.1177/0959683611400199]",10.1177/0959683611400199
1729,Myrtle Lake,-93.37853,47.98645,-93.39207,47.97876,393,50,"Lake surrounded by peatland. Physiography: Red Lake lowlands. Surrounding vegetation: Sphagnum, Piceto-Chamaedophnetum.",,2013-09-30 14:03:01,2016-05-18 21:10:02,0103000020E6100000010000000500000089EFC4AC175957C00551F70148FD474083A3E4D5395857C00551F70148FD474083A3E4D5395857C0D1915CFE43FE474089EFC4AC175957C0D1915CFE43FE474089EFC4AC175957C00551F70148FD4740,1786,1728,3,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Janssen, C.R. 1968. Myrtle Lake: a late- and post-glacial pollen diagram from northern Minnesota. Canadian Journal of Botany 46(11):1397-1410. [DOI: 10.1139/b68-190]",10.1139/b68-190
1729,Myrtle Lake,-93.37853,47.98645,-93.39207,47.97876,393,50,"Lake surrounded by peatland. Physiography: Red Lake lowlands. Surrounding vegetation: Sphagnum, Piceto-Chamaedophnetum.",,2013-09-30 14:03:01,2016-05-18 21:10:02,0103000020E6100000010000000500000089EFC4AC175957C00551F70148FD474083A3E4D5395857C00551F70148FD474083A3E4D5395857C0D1915CFE43FE474089EFC4AC175957C0D1915CFE43FE474089EFC4AC175957C00551F70148FD4740,8423,1728,1,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Janssen, C.R. 1968. Myrtle Lake: a late- and post-glacial pollen diagram from northern Minnesota. Canadian Journal of Botany 46(11):1397-1410. [DOI: 10.1139/b68-190]",10.1139/b68-190
269,Billy's Lake,-94.54948,46.27186,-94.55363,46.2693,383,2.5,Shallow depression on St. Croix moraine. Physiography: rugged relief. Surrounding vegetation: white pine/hardwoods.,,2013-09-30 14:03:01,2016-05-18 21:10:02,0103000020E61000000100000005000000680586AC6EA357C00A68226C78224740977329AE2AA357C00A68226C78224740977329AE2AA357C0978BF84ECC224740680586AC6EA357C0978BF84ECC224740680586AC6EA357C00A68226C78224740,275,269,3,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Jacobson, G.L., Jr., and E.C. Grimm. 1986. A numerical analysis of Holocene forest and prairie vegetation in central Minnesota. Ecology 67(4):958-966. [DOI: 10.2307/1939818]",10.2307/1939818
269,Billy's Lake,-94.54948,46.27186,-94.55363,46.2693,383,2.5,Shallow depression on St. Croix moraine. Physiography: rugged relief. Surrounding vegetation: white pine/hardwoods.,,2013-09-30 14:03:01,2016-05-18 21:10:02,0103000020E61000000100000005000000680586AC6EA357C00A68226C78224740977329AE2AA357C00A68226C78224740977329AE2AA357C0978BF84ECC224740680586AC6EA357C0978BF84ECC224740680586AC6EA357C00A68226C78224740,7929,269,1,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Jacobson, G.L., Jr., and E.C. Grimm. 1986. A numerical analysis of Holocene forest and prairie vegetation in central Minnesota. Ecology 67(4):958-966. [DOI: 10.2307/1939818]",10.2307/1939818
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,1649,1597,3,,Sample depths from core descriptions in LRC files. A reliable radiocarbon chronology does not seem possible (ECG).,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Craig, A.J. 1972. Pollen influx to laminated sediments: a pollen diagram from northeastern Minnesota. Ecology 53(1):46-57. [DOI: 10.2307/1935709]",10.2307/1935709
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,3482,3379,3,,Sample depths corrected in January 2018 by E.C. Grimm based on Appendix 1 in Craig's 1969 thesis.,2013-09-30 14:02:42,2018-01-25 23:58:54,,"Craig, A.J. 1972. Pollen influx to laminated sediments: a pollen diagram from northeastern Minnesota. Ecology 53(1):46-57. [DOI: 10.2307/1935709]",10.2307/1935709
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,3483,3380,3,,The pollen counts are from the same core as LKCLDSH. The actual depths are not known; samples were recorded as varve counts.,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Craig, A.J. 1972. Pollen influx to laminated sediments: a pollen diagram from northeastern Minnesota. Ecology 53(1):46-57. [DOI: 10.2307/1935709]",10.2307/1935709
203,Anderson Pond,-85.49868,36.03275,-85.50396,36.02755,303,34.8,,,2013-09-30 14:03:01,2016-05-18 21:10:02,0103000020E61000000100000005000000809F71E1406055C09D8026C286034240D5CA845FEA5F55C09D8026C286034240D5CA845FEA5F55C0D578E92631044240809F71E1406055C0D578E92631044240809F71E1406055C09D8026C286034240,203,203,3,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Delcourt, H.R. 1979. Late Quaternary vegetation history of the eastern Highland Rim and adjacent Cumberland Plateau of Tennessee. Ecological Monographs 49(3):255-280. [DOI: 10.2307/1942485]",10.2307/1942485
195,Hungry Jack Lake,-91.12,48.15,-91.12,48.15,453,,,,2013-09-30 14:03:01,2016-05-18 21:10:02,0101000020E610000048E17A14AEC756C03333333333134840,195,195,7,,,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Swain, A.M. 1973. A history of fire and vegetation in northeastern Minnesota as recorded in lake sediments. Quaternary Research 3(3):383-396. [DOI: 10.1016/0033-5894(73)900]",10.1016/0033-5894(73)900
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,1649,1597,3,,Sample depths from core descriptions in LRC files. A reliable radiocarbon chronology does not seem possible (ECG).,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Swain, A.M. 1973. A history of fire and vegetation in northeastern Minnesota as recorded in lake sediments. Quaternary Research 3(3):383-396. [DOI: 10.1016/0033-5894(73)900]",10.1016/0033-5894(73)900
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,3482,3379,3,,Sample depths corrected in January 2018 by E.C. Grimm based on Appendix 1 in Craig's 1969 thesis.,2013-09-30 14:02:42,2018-01-25 23:58:54,,"Swain, A.M. 1973. A history of fire and vegetation in northeastern Minnesota as recorded in lake sediments. Quaternary Research 3(3):383-396. [DOI: 10.1016/0033-5894(73)900]",10.1016/0033-5894(73)900
1598,Lake of the Clouds,-91.10962,48.14638,-91.11546,48.13873,462,11.6,"Lake with two basins. Physiography: rugged, steep slopes and cliffs. Surrounding vegetation: Pinus banksiana, Fraxinus, Acer, Alnus, Cornus.",Minnesota DNR Lake ID: 38016900.,2013-09-30 14:03:01,2017-12-18 19:05:37,0103000020E61000000100000005000000C1FF56B263C756C0AF7C96E7C111484034BF9A0304C756C0AF7C96E7C111484034BF9A0304C756C0ED647094BC124840C1FF56B263C756C0ED647094BC124840C1FF56B263C756C0AF7C96E7C1114840,3483,3380,3,,The pollen counts are from the same core as LKCLDSH. The actual depths are not known; samples were recorded as varve counts.,2013-09-30 14:02:42,2013-09-30 14:02:42,,"Swain, A.M. 1973. A history of fire and vegetation in northeastern Minnesota as recorded in lake sediments. Quaternary Research 3(3):383-396. [DOI: 10.1016/0033-5894(73)900]",10.1016/0033-5894(73)900
10461,Pigeon Marsh,-85.4009,34.66407,-85.40174,34.66311,660,0.5,"Small marsh located on Pigeon Mountain, the easern ridge of Lookout Mountain, a forested linear ridge. Marsh plants include Cephalanthus occidentalis, Salix, Scirpus cf. S. cyperinus, Carex tussocks, Persicaria hydropiperoides, Proserpinaca, Eleocharis, Bidens, Rhexia, Sagittaria, Sphagnum, Leucobryum, and Osmunda cinnamomea. Dominant tree around the edge is Acer rubrum and Liquidambar styraciflua. Upland vegetation dominated by Quercus and Carya with some Pinus virginiana. ",,2015-06-03 21:40:58,2016-05-18 21:10:02,0103000020E61000000100000005000000AF5FB01BB65955C046D3D9C9E0544140D93D7958A85955C046D3D9C9E0544140D93D7958A85955C09B20EA3E00554140AF5FB01BB65955C09B20EA3E00554140AF5FB01BB65955C046D3D9C9E0544140,16008,11849,1,,,2015-06-03 21:41:00,2015-06-03 21:41:00,,"Watts, W.A. 1975. Vegetation record for the last 20,000 years from a small marsh on Lookout Mountain, northwestern Georgia. Geological Society of America Bulletin 86(3):287-291. [DOI: 10.1130/0016-7606(1975)86<287:VRFTLY>2.0.CO;2]",10.1130/0016-7606(1975)86<287:VRFTLY>2.0.CO;2
159 changes: 159 additions & 0 deletions src/modules/preprocessing/eda_creator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import utils as ard
import os

def not_in_neotoma(df, df2, path = r'/Users/seiryu8808/Desktop/UWinsc/Github/UnacquiredSites/src/output/eda'):
    """Obtain the articles that have no coordinates in the Neotoma Database.

    An article counts as "not in Neotoma" when at least one of its rows in
    ``df`` has a null ``longeast`` value.

    Parameters
    ----------
    df : pd.DataFrame
        Input data frame; must contain the columns ``_gddid`` and ``longeast``.
    df2 : pd.DataFrame
        Data frame joined on ``_gddid``; must provide the columns ``title``,
        ``year``, ``doi`` and ``link_url``.
    path : str
        Directory where the TSV file is written. It is created when missing.

    Returns
    -------
    pd.DataFrame
        One row per match between a selected ``_gddid`` and ``df2``, with the
        columns ``_gddid``, ``title``, ``year``, ``doi`` and ``link_url``.
        The same frame is written, tab separated, to
        ``articles_wo_neotoma_coordinates.tsv`` inside ``path``.
    """
    # Only the distinct article ids are needed from `df`. They are sorted and
    # rows without an id are dropped, which keeps the row order a
    # `groupby('_gddid')` would produce without aggregating a null column.
    article_ids = (
        df.loc[df['longeast'].isnull(), ['_gddid']]
        .dropna()
        .drop_duplicates()
        .sort_values('_gddid')
    )

    arts_not_in_neotoma = article_ids.merge(df2, on='_gddid')
    arts_not_in_neotoma = arts_not_in_neotoma[['_gddid', 'title', 'year', 'doi', 'link_url']]

    # The default `path` only exists on one machine, so make sure the target
    # directory is there before writing.
    os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, 'articles_wo_neotoma_coordinates.tsv')
    arts_not_in_neotoma.to_csv(output_file, sep='\t', index=False)
    print("A TSV file with articles not found in Neotoma was created on your EDA output folder.")
    return arts_not_in_neotoma

def sentences_w_coords_int(df_with_int, path = r'/Users/seiryu8808/Desktop/UWinsc/Github/UnacquiredSites/src/output/eda'):
    """Export the sentences together with their coordinate intersections.

    Parameters
    ----------
    df_with_int : pd.DataFrame
        Input data frame; must contain the columns ``_gddid``, ``words``,
        ``year``, ``latnorth``, ``found_lat``, ``longeast``, ``found_long``,
        ``dms_regex`` and ``dd_regex``. It is not modified.
    path : str
        Directory where the TSV file is written. It is created when missing.

    Returns
    -------
    pd.DataFrame
        The columns listed above, with ``latnorth`` renamed to
        ``expected_lat`` and ``longeast`` renamed to ``expected_long``.
        The same frame is written, tab separated, to
        ``sentences_with_latlong_intersections.tsv`` inside ``path``.
    """
    selected_columns = [
        '_gddid', 'words', 'year',
        'latnorth', 'found_lat',
        'longeast', 'found_long',
        'dms_regex', 'dd_regex',
    ]
    sent_with_int_df = df_with_int[selected_columns].rename(
        columns={'latnorth': 'expected_lat', 'longeast': 'expected_long'}
    )

    # The default `path` only exists on one machine, so make sure the target
    # directory is there before writing.
    os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, 'sentences_with_latlong_intersections.tsv')
    sent_with_int_df.to_csv(output_file, sep='\t', index=False)
    print("A TSV file with sentences that have coordinates was created in your EDA output folder.")
    return sent_with_int_df

def articles_wo_coords(nlp_bib_neotoma, bibliography, neotoma_joined_df, path = r'/Users/seiryu8808/Desktop/UWinsc/Github/UnacquiredSites/src/output/eda'):
    """Obtain all articles that have no coordinate intersections.

    Parameters
    ----------
    nlp_bib_neotoma : pd.DataFrame
        Input data frame; must contain ``_gddid``, ``found_lat`` and
        ``found_long``. The two ``found_*`` columns must hold values that
        support ``+`` and ``len`` (presumably lists of matched coordinates;
        confirm against the caller).
    bibliography : pd.DataFrame
        Article metadata, merged on the columns it shares with the grouped
        frame (expected to be ``_gddid`` only). Must provide ``title``,
        ``year``, ``doi`` and ``link_url``.
    neotoma_joined_df : pd.DataFrame
        Neotoma records, left-joined on ``doi``; must provide ``latnorth``
        and ``longeast``.
    path : str
        Directory where the TSV file is written. It is created when missing.

    Returns
    -------
    pd.DataFrame
        Articles whose combined ``found_lat`` and ``found_long`` are both
        empty, with the Neotoma coordinates exposed as ``expected_lat`` and
        ``expected_long``. The same frame is written, tab separated, to
        ``articles_wo_latlong_intersections.tsv`` inside ``path``.
    """
    # 'sum' concatenates the per-sentence values, giving one combined value
    # per article.
    per_article = (
        nlp_bib_neotoma.groupby('_gddid')
        .agg({'found_lat': 'sum', 'found_long': 'sum'})
        .reset_index()
    )

    # Keep only the articles where nothing at all was found.
    no_lat = per_article['found_lat'].apply(len) == 0
    no_long = per_article['found_long'].apply(len) == 0
    no_inter_df = per_article[no_lat & no_long]

    no_inter_df = no_inter_df.merge(bibliography)
    # Left join: articles without a Neotoma record are kept with empty
    # expected coordinates.
    no_inter_df = no_inter_df.merge(
        neotoma_joined_df, how='left', left_on='doi', right_on='doi'
    ).rename(columns={'latnorth': 'expected_lat', 'longeast': 'expected_long'})
    no_inter_df = no_inter_df[[
        '_gddid', 'title', 'year',
        'found_lat', 'expected_lat',
        'found_long', 'expected_long',
        'doi', 'link_url',
    ]]

    # The default `path` only exists on one machine, so make sure the target
    # directory is there before writing.
    os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, 'articles_wo_latlong_intersections.tsv')
    no_inter_df.to_csv(output_file, sep='\t', index=False)
    print("A TSV file of articles that have no coordinates was created in your EDA output folder.")
    return no_inter_df



def sentences_w_site_int(nlp_bib_neotoma, path = r'/Users/seiryu8808/Desktop/UWinsc/Github/UnacquiredSites/src/output/eda'):
    """Obtain the sentences that intersect with the expected site names.

    Parameters
    ----------
    nlp_bib_neotoma : pd.DataFrame
        Input data frame, passed to ``ard.find_intersections`` with the
        columns ``words_l`` and ``sitenames_l``. The result must also carry
        ``_gddid``, ``sentid`` and ``year``.
    path : str
        Directory where the TSV file is written. It is created when missing.

    Returns
    -------
    pd.DataFrame
        Rows whose ``found_sitenames`` value has a non-zero length, with
        ``sitenames_l`` renamed to ``expected_sitename`` and
        ``found_sitenames`` renamed to ``intersected_sitename``. The same
        frame is written, tab separated, to
        ``sentences_with_sitenames_intersections.tsv`` inside ``path``.
    """
    sn_inter = ard.find_intersections(
        nlp_bib_neotoma,
        cols_to_intersect=['words_l', 'sitenames_l'],
        new_col_name='found_sitenames',
    )

    # Drop the sentences for which no site name was intersected.
    has_match = sn_inter['found_sitenames'].str.len() != 0
    sn_inter = sn_inter[has_match]

    sn_inter = sn_inter[['_gddid', 'sentid', 'words_l', 'sitenames_l', 'found_sitenames', 'year']]
    sn_inter = sn_inter.rename(columns={
        'sitenames_l': 'expected_sitename',
        'found_sitenames': 'intersected_sitename',
    })

    # The default `path` only exists on one machine, so make sure the target
    # directory is there before writing.
    os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, 'sentences_with_sitenames_intersections.tsv')
    sn_inter.to_csv(output_file, sep='\t', index=False)
    print("A TSV file of sentences with Site intersections was created in your EDA output folder.")
    return sn_inter

def articles_wo_sites(nlp_bib_neotoma, bibliography, neotoma_joined_df, path = r'/Users/seiryu8808/Desktop/UWinsc/Github/UnacquiredSites/src/output/eda'):
    """Obtain all articles that have no site name intersections.

    Parameters
    ----------
    nlp_bib_neotoma : pd.DataFrame
        Input data frame; must contain ``_gddid`` and ``found_sitenames``.
        ``found_sitenames`` must hold values that support ``+`` and can be
        turned into a ``set`` (presumably lists of site names; confirm
        against the caller).
    bibliography : pd.DataFrame
        Article metadata, inner-merged on the columns it shares with the
        grouped frame (expected to be ``_gddid`` only). Must provide
        ``title``, ``year``, ``doi`` and ``link_url``.
    neotoma_joined_df : pd.DataFrame
        Neotoma records, inner-joined on ``doi``; must provide ``sitenames``.
    path : str
        Directory where the TSV file is written. It is created when missing.

    Returns
    -------
    pd.DataFrame
        Articles whose combined ``found_sitenames`` is empty and that have a
        matching Neotoma record. The same frame is written, tab separated,
        to ``articles_wo_sitename_intersections.tsv`` inside ``path``.
    """
    # 'sum' concatenates the per-sentence values, giving one combined value
    # per article.
    arts_wo_sites = (
        nlp_bib_neotoma.groupby('_gddid')
        .agg({'found_sitenames': 'sum'})
        .reset_index()
    )

    # Deduplicate the names found per article, then keep the articles where
    # none were found.
    arts_wo_sites['found_sitenames'] = arts_wo_sites['found_sitenames'].apply(
        lambda names: list(set(names))
    )
    nothing_found = arts_wo_sites['found_sitenames'].str.len() == 0
    arts_wo_sites = arts_wo_sites[nothing_found]

    arts_wo_sites = arts_wo_sites.merge(bibliography, how='inner')
    arts_wo_sites = arts_wo_sites.merge(neotoma_joined_df, left_on='doi', right_on='doi')

    arts_wo_sites = arts_wo_sites[['_gddid', 'title', 'year', 'found_sitenames', 'sitenames', 'doi', 'link_url']]
    # NOTE(review): 'exptected_sitename' is misspelled ('expected_sitename'
    # was probably intended). It is kept as is because it is part of the
    # returned and written column schema; renaming it needs a check of every
    # consumer of this frame and of the TSV file.
    arts_wo_sites = arts_wo_sites.rename(columns={'sitenames': 'exptected_sitename'})

    # The default `path` only exists on one machine, so make sure the target
    # directory is there before writing.
    os.makedirs(path, exist_ok=True)
    output_file = os.path.join(path, 'articles_wo_sitename_intersections.tsv')
    arts_wo_sites.to_csv(output_file, sep='\t', index=False)
    print("A TSV file of Articles without Sites was created in your EDA output folder.")

    return arts_wo_sites

0 comments on commit 2498824

Please sign in to comment.