KeyError: 'response' #3

anshulkansal71 · 2018-10-03T20:19:48Z

import numpy as np
import csv, json
import pandas as pd

################################################################################################

Preparing DJIA data

Reading DJIA index prices csv file

with open('E:/StockPredictions-master/Data/djiaUpdated.csv', 'rb') as csvfile:
spamreader = csv.reader(csvfile, delimiter=',')
# Converting the csv file reader to a lists
data_list = list(spamreader)

Separating header from the data

header = data_list[0]
data_list = data_list[1:]

data_list = np.asarray(data_list)

Selecting date and close value for each day

selected_data = data_list[:, [0, 4, 6]]

df = pd.DataFrame(data=selected_data[0:,1:],
index=selected_data[0:,0],
columns=['close', 'adj close'],
dtype='float64')

Reference for pandas interpolation http://pandas.pydata.org/pandas-docs/stable/missing_data.html

Adding missing dates to the dataframe

df1 = df
idx = pd.date_range('12-29-2006', '12-31-2016')
df1.index = pd.DatetimeIndex(df1.index)
df1 = df1.reindex(idx, fill_value=np.NaN)

df1.count() # gives 2518 count

interpolated_df = df1.interpolate()
interpolated_df.count() # gives 3651 count

Removing extra date rows added in data for calculating interpolation

interpolated_df = interpolated_df[3:]

###############################################################################################

Preparing NYTimes data

Function to parse and convert date format

date_format = ["%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S+%f"]
def try_parsing_date(text):
for fmt in date_format:
#return datetime.strptime(text, fmt)
try:
return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
except ValueError:
pass
raise ValueError('no valid date format found')

years = [2017,2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008]
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
dict_keys = ['pub_date', 'headline'] #, 'lead_paragraph']
articles_dict = dict.fromkeys(dict_keys)

Filtering list for type_of_material

type_of_material_list = ['blog', 'brief', 'news', 'editorial', 'op-ed', 'list','analysis']

Filtering list for section_name

section_name_list = ['business', 'national', 'world', 'u.s.' , 'politics', 'opinion', 'tech', 'science', 'health']
news_desk_list = ['business', 'national', 'world', 'u.s.' , 'politics', 'opinion', 'tech', 'science', 'health', 'foreign']

current_date = '2016-01-01'
from datetime import datetime
#years = [2015]
#months = [3]

current_article_str = ''

Adding article column to dataframe

interpolated_df["articles"] = ''
count_articles_filtered = 0
count_total_articles = 0
count_main_not_exist = 0
count_unicode_error = 0
count_attribute_error = 0
for year in years:
for month in months:
file_str = 'E:/StockPredictions-master/Data/nytimes/' + str(year) + '-' + '{:02}'.format(month) + '.json'
with open(file_str) as data_file:
NYTimes_data = json.load(data_file)
count_total_articles = count_total_articles + len(NYTimes_data['response']['docs'][:])
for i in range(len(NYTimes_data["response"]["docs"][:])):
try:
if any(substring in NYTimes_data["response"]["docs"][:][i]['type_of_material'].lower() for substring in type_of_material_list):
if any(substring in NYTimes_data["response"]["docs"][:][i]['section_name'].lower() for substring in section_name_list):
#count += 1
count_articles_filtered += 1
#print 'i: ' + str(i)
articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }
articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline
#articles_dict['headline'] = articles_dict['lead_paragraph'] # Selecting lead_paragraph
date = try_parsing_date(articles_dict['pub_date'])
#print 'article_dict: ' + articles_dict['headline']
if date == current_date:
current_article_str = current_article_str + '. ' + articles_dict['headline']
else:
interpolated_df.set_value(current_date, 'articles', interpolated_df.loc[current_date, 'articles'] + '. ' + current_article_str)
current_date = date
#interpolated_df.set_value(date, 'articles', current_article_str)
#print str(date) + current_article_str
current_article_str = articles_dict['headline']
# For last condition in a year
if (date == current_date) and (i == len(NYTimes_data["response"]["docs"][:]) - 1):
interpolated_df.set_value(date, 'articles', current_article_str)

         #Exception for section_name or type_of_material absent
        except AttributeError:
            #print 'attribute error'
            #print NYTimes_data["response"]["docs"][:][i]
            count_attribute_error += 1
            # If article matches news_desk_list if none section_name found
            try:
                if any(substring in NYTimes_data["response"]["docs"][:][i]['news_desk'].lower() for substring in news_desk_list):
                        #count += 1
                        count_articles_filtered += 1
                        #print 'i: ' + str(i)
                        articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }
                        articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline
                        #articles_dict['headline'] = articles_dict['lead_paragraph'] # Selecting lead_paragraph
                        date = try_parsing_date(articles_dict['pub_date'])
                        #print 'article_dict: ' + articles_dict['headline']
                        if date == current_date:
                            current_article_str = current_article_str + '. ' + articles_dict['headline']
                        else:  
                            interpolated_df.set_value(current_date, 'articles', interpolated_df.loc[current_date, 'articles'] + '. ' + current_article_str)
                            current_date = date
                            #interpolated_df.set_value(date, 'articles', current_article_str)
                            #print str(date) + current_article_str
                            current_article_str = articles_dict['headline']
                        # For last condition in a year
                        if (date == current_date) and (i == len(NYTimes_data["response"]["docs"][:]) - 1): 
                            interpolated_df.set_value(date, 'articles', current_article_str)   
            
            except AttributeError:
                pass
            pass
        except KeyError:
            print 'key error'
            #print NYTimes_data["response"]["docs"][:][i]
            count_main_not_exist += 1
            pass   
        except TypeError:
            print "type error"
            #print NYTimes_data["response"]["docs"][:][i]
            count_main_not_exist += 1
            pass

print count_articles_filtered
print count_total_articles
print count_main_not_exist
print count_unicode_error

Putting all articles if no section_name or news_desk not found

for date, row in interpolated_df.T.iteritems():
if len(interpolated_df.loc[date, 'articles']) <= 400:
#print interpolated_df.loc[date, 'articles']
#print date
month = date.month
year = date.year
file_str = 'E:/StockPredictions-master/Data/nytimes/' + str(year) + '-' + '{:02}'.format(month) + '.json'
with open(file_str) as data_file:
NYTimes_data = json.load(data_file)
count_total_articles = count_total_articles + len(NYTimes_data["response"]["docs"][:])
interpolated_df.set_value(date.strftime('%Y-%m-%d'), 'articles', '')
for i in range(len(NYTimes_data["response"]["docs"][:])):
try:

            articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }
            articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline
            #articles_dict['headline'] = articles_dict['lead_paragraph'] # Selecting lead_paragraph       
            pub_date = try_parsing_date(articles_dict['pub_date'])
            #print 'article_dict: ' + articles_dict['headline']
            if date.strftime('%Y-%m-%d') == pub_date: 
                interpolated_df.set_value(pub_date, 'articles', interpolated_df.loc[pub_date, 'articles'] + '. ' + articles_dict['headline'])  
            
        except KeyError:
            print 'key error'
            #print NYTimes_data["response"]["docs"][:][i]
            #count_main_not_exist += 1
            pass   
        except TypeError:
            print "type error"
            #print NYTimes_data["response"]["docs"][:][i]
            #count_main_not_exist += 1
            pass

#>>> print count_articles_filtered
#440770
#>>> print count_total_articles
#1073132

Filtering the whole data for a year

#filtered_data = interpolated_df.ix['2016-01-01':'2016-12-31']
#filtered_data.to_pickle('/Users/Dinesh/Documents/Project Stock predictions/data/pickled_ten_year_all.pkl')

Saving the data as pickle file

interpolated_df.to_pickle('E:/StockPredictions-master/Data//pickled_ten_year_filtered_lead_para.pkl')

Save pandas frame in csv form

interpolated_df.to_csv('E:/StockPredictions-master/Data/sample_interpolated_df_10_years_filtered_lead_para.csv',
sep='\t', encoding='utf-8')

Reading the data as pickle file

dataframe_read = pd.read_pickle('E:/StockPredictions-master/Data/pickled_ten_year_filtered_lead_para.pkl')

#################################################################################

Filtering rows

#filtered_data = interpolated_df.ix['2016-01-01':'2016-12-31']

Syntax for accessing the data

#NYTimes_data["response"]["docs"][1:2][:]['headline']['main']
#NYTimes_data["response"]["docs"][1:2][0]['pub_date']

articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }

try:

articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline

except KeyError:

count_main_not_exist += 1

pass

except TypeError:

count_main_not_exist += 1

pass

Find out articles with less number of articles

for date, row in interpolated_df.T.iteritems():

if len(interpolated_df.loc[date, 'articles']) < 300:

print interpolated_df.loc[date, 'articles']

print date

KeyError Traceback (most recent call last)
in ()
87 with open(file_str) as data_file:
88 NYTimes_data = json.load(data_file)
---> 89 count_total_articles = count_total_articles + len(NYTimes_data['response']['docs'][:])
90 for i in range(len(NYTimes_data["response"]["docs"][:])):
91 try:

KeyError: 'response'

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

KeyError: 'response' #3

KeyError: 'response' #3

anshulkansal71 commented Oct 3, 2018

KeyError: 'response' #3

KeyError: 'response' #3

Comments

anshulkansal71 commented Oct 3, 2018

Preparing DJIA data

Reading DJIA index prices csv file

Separating header from the data

Selecting date and close value for each day

Reference for pandas interpolation http://pandas.pydata.org/pandas-docs/stable/missing_data.html

Adding missing dates to the dataframe

df1.count() # gives 2518 count

Removing extra date rows added in data for calculating interpolation

Preparing NYTimes data

Function to parse and convert date format

Filtering list for type_of_material

Filtering list for section_name

Adding article column to dataframe

Putting all articles if no section_name or news_desk not found

Filtering the whole data for a year

Saving the data as pickle file

Save pandas frame in csv form

Reading the data as pickle file

Filtering rows

Syntax for accessing the data

articles_dict = { your_key: NYTimes_data["response"]["docs"][:][i][your_key] for your_key in dict_keys }

try:

articles_dict['headline'] = articles_dict['headline']['main'] # Selecting just 'main' from headline

except KeyError:

count_main_not_exist += 1

pass

except TypeError:

count_main_not_exist += 1

pass

Find out articles with less number of articles

for date, row in interpolated_df.T.iteritems():

if len(interpolated_df.loc[date, 'articles']) < 300:

print interpolated_df.loc[date, 'articles']

print date