"""Download USGS daily mean discharge data for a list of gauge sites.

Reads gauge IDs from ``ref_file`` (one per line), downloads each site's
data through ulmo's NWIS HDF5 cache, and writes the daily mean discharge
series (parameter 00060, statistic code 00003) to one CSV per site in
``data_dir``.  Sites whose CSV already exists are skipped, so the script
is resumable after an interruption.
"""
import os

import numpy as np
import pandas as pd
from ulmo.usgs import nwis

ref_file = 'ref_gauge_ids.txt'
data_dir = 'usgs_data_ulmo'
# cache_db = 'cache/usgs_cache.h5'

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Read site IDs as strings so leading zeros in gauge numbers survive.
# (np.character was removed from NumPy; plain ``str`` is the supported spelling.)
ids = pd.read_csv(ref_file, header=None, names=['site_id'],
                  dtype={'site_id': str})

for site_id in ids['site_id']:
    data_file = os.path.join(data_dir, site_id + '.csv')
    if os.path.exists(data_file):
        continue  # already downloaded -- skip to keep the run resumable
    # A site may not exist, or the NWIS service may fail; log and move on.
    try:
        # Download and cache site data (slow on the first run).
        # Currently downloads all available parameters.
        nwis.hdf5.update_site_data(site_id)
        # Daily mean discharge: parameter 00060, statistics code 00003.
        data = nwis.hdf5.get_site_data(
            site_id, parameter_code='00060:00003')['00060:00003']
        # Convert to a DataFrame, dropping ulmo's bookkeeping columns.
        df = pd.DataFrame(data['values']).drop(
            ['last_checked', 'last_modified'], axis=1)
        # np.float was removed from NumPy; the builtin float is equivalent.
        df['value'] = df['value'].apply(float)
        df['datetime'] = pd.to_datetime(df['datetime'])
        # NWIS uses -999999 as its missing-data sentinel; mark those NaN.
        # (Original compared ``df.values`` -- the whole underlying array --
        # instead of the 'value' column; fixed to target the column.)
        df.loc[df['value'] == -999999, 'value'] = np.nan
        # df.to_sql('flow', con, if_exists='append')
        df.to_csv(data_file, index=False)
    except Exception as exc:
        # Narrowed from a bare except; report the cause instead of hiding it.
        print('Could not get data for site %s: %s\n' % (site_id, exc))