This script will demonstrate how to:
Start by creating folders to export the data to:
import os
# Create folders if they do not already exist
for folder in ['csv', 'html', 'json', 'mat', 'npy', 'ods', 'tab', 'tsv', 'txt', 'xls', 'xlsx']:
os.makedirs(folder, exist_ok=True)
Next, open the terminal and install the packages that will be needed
(replace python3.11
with the version of Python you have
installed and are using):
$ python3.11 -m pip install numpy
$ python3.11 -m pip install pandas
$ python3.11 -m pip install sqlite3
$ python3.11 -m pip install scipy
$ python3.11 -m pip install pyexcel_ods
$ python3.11 -m pip install xlwt
$ python3.11 -m pip install xlrd
$ python3.11 -m pip install xlsxwriter
$ python3.11 -m pip install openpyxl
Create a list:
ls = [1, 2, 3, 4, 5] + list('ABCDE') + list(range(5))
Create a list-of-lists:
ls_of_ls = [
[1, 'Bulbasaur', 'Grass', 'Poison'],
[2, 'Ivysaur', 'Grass', 'Poison'],
[3, 'Venusaur', 'Grass', 'Poison'],
[4, 'Charmander', 'Fire'],
[5, 'Charmeleon', 'Fire'],
[6, 'Charizard', 'Fire', 'Flying'],
[7, 'Squirtle', 'Water'],
[8, 'Wartortle', 'Water'],
[9, 'Blastoise', 'Water'],
]
Create a tuple:
tupl = (1, 2, 3, 4, 5)
Create a tuple of tuples:
tupl_of_tupl = (
(1, 2),
(3, 4),
(5, 6)
)
Import NumPy:
import numpy as np
Create an array:
array = np.arange(5)
Create an array-of-arrays:
array_of_arrays = np.array([
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
])
Create a string:
st = 'Hello, World!'
Create a multi-line string:
st_multi = """Paragraph 1:
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ut tellus vel nisi semper faucibus. Praesent ultricies
a urna a tincidunt. Curabitur nec posuere est, non convallis nibh. Donec interdum, lacus at ultrices faucibus, urna elit
mollis nunc, id cursus leo augue ut orci.
Paragraph 2:
Nulla maximus, quam ac semper lobortis, diam quam sodales diam, quis efficitur tortor tortor volutpat neque. Fusce
turpis libero, porttitor quis eros a, mollis lobortis eros. Pellentesque pellentesque augue consequat tortor vulputate
hendrerit.
"""
Create a string of comma-separated values:
st_csv = '30,40,50'
Create a string of tab-separated values:
st_tsv = '30\t40\t50'
Create a string of space-separated values:
st_ssv = '30 40 50'
Create a multi-line string of space-separated values:
st_multi_ssv = """
Year NDay Pos0 Pos1 Pos2 Pos3 Pos4
2015 4 8 11 13 14 18
2016 4 18 18 19 18 17
2017 4 17 20 25 26 27
2018 4 27 26 26 26 25
2019 4 25 24 23 22 21
"""
Create a dictionary:
dct = {
'Fibonacci': [0, 1, 1, 2, 3, 5, 8, 13],
'Primes': [2, 3, 5, 7, 11, 13, 17, 19]
}
Import Pandas:
import pandas as pd
Create a series:
ser = pd.Series(['Alpha', 'Bravo', 'Charlie', 'Delta', 'Echo'])
Create a data frame:
df = pd.DataFrame({
'Name': ['Charles Babbage', 'Ada Lovelace', 'Alan Turing', 'Mary Somerville', 'John Conway', 'Katherine Johnson'],
'Age': [79, 36, 41, 91, 82, 101],
'DOB': ['1791-12-26', '1815-12-10', '1912-06-23', '1780-12-26', '1937-12-26', '1918-08-26'],
})
Create an SQL database:
import sqlite3
dbase = sqlite3.connect(':memory:')
cur = dbase.cursor()
cur.execute('CREATE TABLE Example (Name text, DOB date, Age integer)')
cur.execute("INSERT INTO Example (Name, DOB, Age) VALUES ('Picasso', '1881-10-25', 91)")
cur.execute(f"INSERT INTO Example VALUES ('Lovelace', '1815-12-10', {1852 - 1815})")
Export a data frame as comma-separated values (.csv):
df.to_csv('csv/data frame.csv', index=False)
Export a data frame as tab-separated values (.tsv):
df.to_csv('tsv/data frame.tsv', index=False, sep='\t')
Export a data frame as tab-separated values (.tab):
df.to_csv('tab/data frame.tab', index=False, sep='\t')
Export a data frame as an Excel 97-2004 workbook (.xls):
import xlwt
wb = xlwt.Workbook()
ws = wb.add_sheet('Sheet1')
for col_num in range(df.shape[1]):
# Write column headers
ws.write(0, col_num, list(df)[col_num])
for row_num in range(df.shape[0]):
# Write the data frame's content
ws.write(row_num + 1, col_num, str(df.iloc[row_num, col_num]))
wb.save('xls/data frame.xls')
Export a data frame as an Excel 2007 workbook (.xlsx):
df.to_excel('xlsx/data frame.xlsx')
Export multiple data frames as individual sheets of an Excel 2007 workbook (.xlsx):
with pd.ExcelWriter('xlsx/multiple data frames.xlsx') as writer:
df.to_excel(writer, sheet_name='Sheet_1')
df.to_excel(writer, sheet_name='Sheet_2')
Export a data frame as space-separated values in a text file (.txt):
import numpy as np
np.savetxt('txt/data frame - space-separated values.txt', df, fmt='%s')
Export a data frame as comma-separated values in a text file (.txt):
np.savetxt('txt/data frame - comma-separated values.txt', df, fmt='%s', delimiter=',')
Export a data frame as tab-separated values in a text file (.txt):
output = [df.columns.tolist()] + df.values.tolist()
np.savetxt('txt/data frame - tab-separated values.txt', output, fmt='%s', delimiter='\t')
dct = {
'Fibonacci': [0, 1, 1, 2, 3, 5, 8, 13],
'Primes': [2, 3, 5, 7, 11, 13, 17, 19]
}
Export a dictionary as a MATLAB object (.mat):
import scipy.io
filepath = 'mat/dictionary.mat'
scipy.io.savemat(filepath, dct, oned_as='row')
Export a dictionary as a NumPy object (.npy):
import numpy as np
filepath = 'npy/dictionary.npy'
np.save(filepath, dct)
Export a dictionary as comma-separated values (.csv) in four different ways:
import csv
filepath = 'csv/dictionary - multiple columns, single cell in each.csv'
with open(filepath, 'w') as file:
w = csv.DictWriter(file, dct.keys())
w.writeheader()
w.writerow(dct)
filepath = 'csv/dictionary - multiple columns, multiple cells in each.csv'
with open(filepath, 'w') as file:
w = csv.writer(file)
w.writerow(dct.keys())
for i in range(len(list(dct.values())[0])):
w.writerow([ls[i] for ls in list(dct.values())])
filepath = 'csv/dictionary - multiple rows, single cell in each.csv'
with open(filepath, 'w') as file:
w = csv.writer(file)
for key in list(dct.keys()):
w.writerow([key, str(dct[key])])
filepath = 'csv/dictionary - multiple rows, multiple cells in each.csv'
with open(filepath, 'w') as file:
w = csv.writer(file)
for key in list(dct.keys()):
w.writerow([key] + dct[key])
Export a dictionary as tab-separated values (.tsv) in four different ways:
import csv
filepath = 'tsv/dictionary - multiple columns, single cell in each.tsv'
with open(filepath, 'w') as file:
w = csv.DictWriter(file, dct.keys(), delimiter='\t')
w.writeheader()
w.writerow(dct)
filepath = 'tsv/dictionary - multiple columns, multiple cells in each.tsv'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
w.writerow(dct.keys())
for i in range(len(list(dct.values())[0])):
w.writerow([ls[i] for ls in list(dct.values())])
filepath = 'tsv/dictionary - multiple rows, single cell in each.tsv'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
for key in list(dct.keys()):
w.writerow([key, str(dct[key])])
filepath = 'tsv/dictionary - multiple rows, multiple cells in each.tsv'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
for key in list(dct.keys()):
w.writerow([key] + dct[key])
Export a dictionary as tab-separated values (.tab) in four different ways:
import csv
filepath = 'tab/dictionary - multiple columns, single cell in each.tab'
with open(filepath, 'w') as file:
w = csv.DictWriter(file, dct.keys(), delimiter='\t')
w.writeheader()
w.writerow(dct)
filepath = 'tab/dictionary - multiple columns, multiple cells in each.tab'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
w.writerow(dct.keys())
for i in range(len(list(dct.values())[0])):
w.writerow([ls[i] for ls in list(dct.values())])
filepath = 'tab/dictionary - multiple rows, single cell in each.tab'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
for key in list(dct.keys()):
w.writerow([key, str(dct[key])])
filepath = 'tab/dictionary - multiple rows, multiple cells in each.tab'
with open(filepath, 'w') as file:
w = csv.writer(file, delimiter='\t')
for key in list(dct.keys()):
w.writerow([key] + dct[key])
Export a dictionary to a text file (.txt):
import json
filepath = 'txt/dictionary.txt'
json.dump(dct, open(filepath, 'w'))
Export a dictionary to a JSON file (.json):
import json
filepath = 'json/dictionary.json'
with open(filepath, 'w') as file:
json.dump(dct, file)
Export list to comma-separated values (.csv):
import numpy as np
np.savetxt('csv/list.csv', ls, delimiter=',', fmt='%s')
Export list to tab-separated values (.tsv):
np.savetxt('tsv/list.tsv', ls, delimiter='\t', fmt='%s')
Export list to tab-separated values (.tab):
np.savetxt('tab/list.tab', ls, delimiter='\t', fmt='%s')
Export list to open document spreadsheet (.ods):
import pyexcel_ods
pyexcel_ods.save_data('ods/list.ods', {'Sheet 1': [ls]})
Export list to Excel 97-2004 workbook (.xls):
import xlwt
book = xlwt.Workbook()
sheet1 = book.add_sheet('sheet1')
for i, v in enumerate(ls):
sheet1.write(i, 0, v)
book.save('xls/list.xls')
Export list to text file (.txt) as a single line:
with open('txt/list - single line.txt', 'w') as file:
file.write(str(ls))
Export list to text file (.txt) with each element on a new line:
with open('txt/list - multiple lines.txt', 'w') as file:
for v in ls:
file.write(str(v) + '\n')
Export list to HTML file (.html):
file = open('html/list.html', 'w')
for item in ls:
file.write(str(item) + '\n')
file.close()
Export list to JavaScript Object Notation (JSON) file (.json):
import json
with open('json/list.json', 'w') as file:
json.dump(ls, file)
Export list-of-lists to text file (.txt):
with open('txt/list-of-lists.txt', 'w') as file:
json.dump(ls_of_ls, file)
Export list-of-lists to comma-separated values (.csv):
with open('csv/list-of-lists.csv', 'w', newline='') as file:
a = csv.writer(file, delimiter=',')
a.writerows(ls_of_ls)
Export list-of-lists to comma-separated values (.csv) (one-liner):
csv.writer(open('csv/list-of-lists.csv', 'w')).writerows(ls_of_ls)
Export list-of-lists to text file (.txt) as comma-separated values (csv):
with open('txt/list-of-lists - comma-separated values.txt', 'w', newline='') as file:
a = csv.writer(file, delimiter=',')
a.writerows(ls_of_ls)
Export list-of-lists to tab-separated values file (.tsv):
with open('tsv/list-of-lists.tsv', 'w', newline='') as file:
a = csv.writer(file, delimiter='\t')
a.writerows(ls_of_ls)
Export list-of-lists to tab-separated values file (.tab):
with open('tab/list-of-lists.tab', 'w', newline='') as file:
a = csv.writer(file, delimiter='\t')
a.writerows(ls_of_ls)
Export list-of-lists to text file (.txt) as tab-separated values (tsv):
with open('txt/list-of-lists - tab-separated values.txt', 'w', newline='') as file:
csv.writer(file, delimiter='\t').writerows(ls_of_ls)
Export list-of-lists to JavaScript Object Notation (JSON) file (.json):
import json
with open('json/list-of-lists.json', 'w') as file:
json.dump(ls_of_ls, file)
Export list-of-lists to sheets of an Excel 97-2004 workbook (.xls):
wb = xlwt.Workbook()
ws = wb.add_sheet('Sheet1')
for i, row in enumerate(ls_of_ls):
for j, cell in enumerate(row):
ws.write(i, j, cell)
ws.col(0).width = 256 * max([len(row[0:]) for row in ls_of_ls])
ws = wb.add_sheet('Sheet2')
for i, row in enumerate(reversed(ls_of_ls)):
for j, cell in enumerate(row):
ws.write(i, j, cell)
ws.col(0).width = 256 * max([len(row[0:]) for row in ls_of_ls])
wb.save('xls/list-of-lists.xls')
Export list-of-lists to sheets of an Excel 2007 workbook (.xlsx):
import xlsxwriter
wb = xlsxwriter.Workbook('xlsx/list-of-lists.xlsx')
ws = wb.add_worksheet('Sheet1')
for i, row in enumerate(ls_of_ls):
for j, col in enumerate(row):
ws.write(i, j, col)
ws = wb.add_worksheet('Sheet2')
for i, row in enumerate(reversed(ls_of_ls)):
for j, col in enumerate(row):
ws.write(i, j, col)
wb.close()
Export list-of-lists to open document spreadsheet (.ods):
data = {
'Sheet 1': ls_of_ls,
'Sheet 2': reversed(ls_of_ls)
}
pyexcel_ods.save_data('ods/list-of-lists.ods', data)
Export string to text file - one line at a time:
lines = [y for y in st_multi.splitlines()]
file = open('txt/string - one line at a time.txt', 'w')
for line in lines:
file.write(str(line) + '\n')
file.close()
Export string to text file - all together:
file = open('txt/string - all together.txt', 'w')
file.write(st_multi)
file.close()
Export SQL table to a spreadsheet:
def export_table(tablename, filename):
"""
Export an SQL table in a specified format.
SQLite does not support information schema so you need to use pragma.
"""
# Export table to comma-separated values (.csv)
if filename.endswith('csv'):
w = csv.writer(open(filename, 'w'))
cur.execute(f'PRAGMA table_info ({tablename})')
w.writerow([x[1] for x in cur.fetchall()])
cur.execute(f'SELECT * FROM {tablename}')
w.writerows(cur.fetchall())
# Export table to Excel 97-2004 workbook (.xls)
elif filename.endswith('xls'):
wb = xlwt.Workbook()
ws = wb.add_sheet('Sheet 1')
cur.execute(f'PRAGMA table_info ({tablename})')
for col_num, col_details in enumerate(cur.fetchall()):
ws.write(0, col_num, col_details[1])
cur.execute(f'SELECT * FROM {tablename}')
for i, row in enumerate(cur.fetchall()):
for j, cell in enumerate(row):
ws.write(i + 1, j, cell)
wb.save(filename)
# Export table to Excel 2007 workbook (.xlsx)
elif filename.endswith('xlsx'):
wb = xlsxwriter.Workbook(filename)
ws = wb.add_worksheet('Sheet 1')
cur.execute(f'PRAGMA table_info ({tablename})')
for col_num, col_details in enumerate(cur.fetchall()):
ws.write(0, col_num, col_details[1])
cur.execute(f'SELECT * FROM {tablename}')
for i, row in enumerate(cur.fetchall()):
for j, cell in enumerate(row):
ws.write(i + 1, j, cell)
wb.close()
# Export table to open document spreadsheet (.ods)
elif filename.endswith('ods'):
cur.execute(f'PRAGMA table_info ({tablename})')
cols = [x[1] for x in cur.fetchall()]
cur.execute(f'SELECT * FROM {tablename}')
out = [cols] + [[y for y in x] for x in cur.fetchall()]
data = {'Sheet 1': out}
pyexcel_ods.save_data(filename, data)
# Exception handling
else:
ext = filename.split('.')[-1]
raise ValueError(
f'Extension "{ext}" is not recognised; ' +
'please use csv, xls, xlsx or ods files'
)
export_table('Example', 'csv/SQL table.csv')
export_table('Example', 'xls/SQL table.xls')
export_table('Example', 'xlsx/SQL table.xlsx')
export_table('Example', 'ods/SQL table.ods')
CSV file to Pandas data frame
Use read_csv
to import comma-separated values in a CSV
file (with the .csv extension) into Python as a Pandas data frame:
import pandas as pd
df = pd.read_csv('csv/data frame.csv')
To import all CSV files in a folder at once:
for file in os.listdir('csv'):
filepath = os.path.join('csv', file)
df = pd.read_csv(filepath)
The default behaviour of read_csv
will be to use the
first row as the column names and the numbers 0, 1, 2, etc, as the
indexes (row names). If you instead want to use the values in the first
column as the indexes (row names), use index_col=0
:
df = pd.read_csv('csv/data frame.csv', index_col=0)
If there are no column names (ie if the values in the first row are
not the names of the columns) use header=None
and the
columns will instead be given the numbers 0, 1, 2, etc as their
names:
df = pd.read_csv('csv/data frame.csv', header=None)
If there are no column names but you want to provide your own, use
header=None
and also names
to define them
manually:
df = pd.read_csv('csv/data frame.csv', header=None, names=['col1', 'col2', 'col3'])
To customise which values get interpreted as missing data, use the
na_values
keyword argument:
df = pd.read_csv('csv/data frame.csv', na_values=['Missing', 'None'])
In addition to whatever you manually define using
na_values
as demonstrated above, the following will all be
considered missing data: ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’,
‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘
To use a custom format for interpreting dates you need both
parse_dates
and date_format
as shown
below:
# Example date formats:
# '%Y-%m-%d %H:%M:%S' 2000-01-02 17:01:02
# '%Y-%m-%d' 2000-01-02
# '%d/%m/%Y' 02/01/2000
# '%d %b %Y %H:%M:%S' 2 Jan 2000
df = pd.read_csv('csv/data frame.csv', parse_dates=['DOB'], date_format='%Y-%m-%d')
CSV file to dictionary
dct = {}
with open('csv/dictionary - multiple columns, single cell in each.csv', newline='') as file:
reader = csv.DictReader(file)
for row in reader:
dct = dict(row)
CSV file to list-of-lists
ls_of_ls = []
with open('csv/list-of-lists.csv', newline='') as file:
for row in csv.reader(file):
ls_of_ls.append(row)
CSV file to NumPy array
ar = np.genfromtxt('csv/array.csv', delimiter=',')
Comma-separated values in a text file to a tuple of lists of strings
tupl_of_lists = ()
with open('txt/list-of-lists - comma-separated values.txt', newline='') as file:
# Replace empty cells with blanks
for row in csv.reader((line.replace('\0', '') for line in file)):
tupl_of_lists = tuple(list(tupl_of_lists) + [row])
Import tab-separated values in a TSV file (with the .tsv extension)
as a Pandas data frame using read_csv
with the
sep='\t'
option:
df = pd.read_csv('tsv/data frame.tsv', sep='\t')
To import as a list:
ls = []
with open('tsv/list.tsv') as file:
for line in file:
ls.append(line[:-1])
Import tab-separated values in a text file as a Pandas data frame:
df = pd.read_csv('txt/data frame - tab-separated values.txt', sep='\t')
Import tab-separated values in a text file as a tuple of lists of strings:
tupl_of_lists = []
with open('txt/list-of-lists - tab-separated values.txt', newline='') as file:
for row in csv.reader(file, delimiter='\t'):
tupl_of_lists.append(row)
Import tab-separated values in a TAB file (with the .tab extension)
as a Pandas data frame using read_csv
with the
sep='\t'
option:
df = pd.read_csv('tab/data frame.tab', sep='\t')
Import MATLAB data files (.mat) as a dictionary:
import scipy.io
dct = scipy.io.loadmat('mat/dictionary.mat')
Import text file (.txt) as a string:
file = open('txt/string - all together.txt', 'r')
st = file.read()
Convert that string into a list of lines:
# Break the string up on the newline characters
ls = st.split('\n')
Import text file (.txt) as a list of lines (this is equivalent to the
above, it’s just a different way of using the output of the
open()
function):
ls = []
with open('txt/string - all together.txt') as file:
for line in file:
# Remove newline characters
ls.append(line.strip('\n'))
Import text file (.txt) as a Pandas series:
import pandas as pd
ser = pd.read_csv('txt/series.txt', header=None)
The json
module allows text stored in plain text files
to be interpreted exactly as Python code:
Import a list-of-lists stored in a text file (.txt) as a list-of-lists:
import json
with open('txt/list-of-lists.txt', 'r') as file:
ls = json.load(file)
# or
ls = json.load(open('txt/list-of-lists.txt'))
Import a dictionary stored in a text file (.txt) as a dictionary:
import json
dct = json.load(open('txt/dictionary.txt'))
These last two examples are identical except for the fact that the contents of ‘list-of-lists.txt’ forms a list-of-lists when interpreted as Python code whereas the contents of ‘dictionary.txt’ forms a dictionary.
Import NumPy object (.npy) as a dictionary:
import numpy as np
dct = np.load('npy/dictionary.npy', allow_pickle=True).item()
Import HTML as a list:
ls = []
for line in open('html/list.html', 'r', encoding='utf8'):
# Remove the newline character from the end of each line
ls.append(line[:-1])
Import JSON as a dictionary:
import json
with open('json/dictionary.json') as file:
dct = json.load(file)
Import open document spreadsheet (.ods) as a list-of-lists:
def import_ods(filename):
from pyexcel_ods import get_data
data = []
wb = get_data(filename)
# wb is an OrderedDict where each key is a sheetname
for sheetname in list(wb.keys()):
sheet = wb[sheetname]
data += [sheet]
return data
ls_of_ls = import_ods('ods/list-of-lists.ods')
Import Excel 97-2004 workbook (.xls) as a list-of-lists:
def import_xls(filename):
"""Import Excel 97-2004 workbook (.xls) as a list-of-lists."""
import xlrd
data = []
wb = xlrd.open_workbook(filename)
sh1 = wb.sheet_by_index(0)
for rownum in range(sh1.nrows):
data += [sh1.row_values(rownum)]
return data
ls_of_ls = import_xls('xls/list-of-lists.xls')
Import Excel 97-2004 workbook (.xls) as a data frame (this is done with Pandas but the xlrd package is also required as it’s a dependency):
import pandas as pd
df = pd.read_excel('xls/data frame.xls', index_col=0)
Import an Excel 2007 workbook (.xlsx) as a list-of-lists:
def import_xlsx(filename):
"""Import Excel 2007 workbook (.xlsx) as a list-of-lists."""
import openpyxl
data = []
wb = openpyxl.load_workbook(filename)
for sheetname in wb.sheetnames:
sh = wb[sheetname]
for row in sh.values:
data += [list(row)]
return data
ls_of_ls = import_xlsx('xlsx/list-of-lists.xlsx')
Import an Excel 2007 workbook (.xlsx) as a data frame. If there are multiple sheets in the workbook, only the first will be imported. The first row will be used as the column names and the numbers 0, 1, 2, etc, will be used as the indexes (row names):
import pandas as pd
df = pd.read_excel('xlsx/data frame.xlsx')
If you instead want to use the values in the first column as the
indexes (row names), use index_col=0
:
df = pd.read_excel('xlsx/data frame.xlsx', index_col=0)
If there are no column names (ie if the values in the first row are
not the names of the columns) use header=None
and the
columns will instead be given the numbers 0, 1, 2, etc as their
names:
df = pd.read_excel('xlsx/data frame.xlsx', header=None)
If there are no column names but you want to provide your own, use
header=None
and also names
to define them
manually:
df = pd.read_excel('xlsx/data frame.xlsx', header=None, names=['col1', 'col2', 'col3'])
If you want to import a sheet that isn’t the first, use
sheet_name
and then either the position of the sheet as a
number or the name of the sheet as a string:
df = pd.read_excel('xlsx/data frame.xlsx', sheet_name=0)
df = pd.read_excel('xlsx/data frame.xlsx', sheet_name='Sheet1')
Import spreadsheet as a list-of-lists:
def import_spreadsheet_list(filename):
"""Import a spreadsheet and convert it to a list-of-lists."""
data = []
if filename.endswith('csv'):
import csv
data = []
reader = csv.reader(open(filename, 'r'))
for row in reader:
# Be able to handle empty cells
for i, v in enumerate(row):
if v == '':
row[i] = 'None'
data.append(row)
elif filename.endswith('xls'):
import xlrd
wb = xlrd.open_workbook(filename)
for i, sheet_name in enumerate(wb.sheet_names()):
sh = wb.sheet_by_name(sheet_name)
sheet = []
for rownum in range(sh.nrows):
sheet += [sh.row_values(rownum)]
data += [sheet]
elif filename.endswith('xlsx'):
from openpyxl import load_workbook
try:
wb = load_workbook(filename)
except IOError:
raise IOError(f'The file "{filename}" was not found!')
for sh in wb.worksheets:
sheet = []
for row in sh.iter_rows():
data_row = []
for cell in row:
data_row += [cell.internal_value]
sheet += [data_row]
data += [sheet]
elif filename.endswith('ods'):
from pyexcel_ods import get_data
wb = get_data(filename)
# wb is an OrderedDict where each key is a sheetname
for sheetname in list(wb.keys()):
sheet = wb[sheetname]
data += [sheet]
return data
sheet1 = import_spreadsheet_list('csv/list-of-lists.csv') # A CSV can only have one sheet
[sheet1, sheet2] = import_spreadsheet_list('xls/list-of-lists.xls')
[sheet1, sheet2] = import_spreadsheet_list('xlsx/list-of-lists.xlsx')
[sheet1, sheet2] = import_spreadsheet_list('ods/list-of-lists.ods')
Import spreadsheet as a dictionary of sheets which are lists-of-lists:
def import_spreadsheet_dict(filename):
"""Import a spreadsheet and convert it to a list-of-lists."""
data = {}
if filename.endswith('csv'):
import csv
data = []
reader = csv.reader(open(filename, 'r'))
for row in reader:
# Be able to handle empty cells
for i, v in enumerate(row):
if v == '':
row[i] = 'None'
data.append(row)
elif filename.endswith('xls'):
import xlrd
wb = xlrd.open_workbook(filename)
for i, sheet_name in enumerate(wb.sheet_names()):
sh = wb.sheet_by_name(sheet_name)
sheet = []
for rownum in range(sh.nrows):
sheet += [sh.row_values(rownum)]
data[sheet_name] = sheet
elif filename.endswith('xlsx'):
from openpyxl import load_workbook
wb = load_workbook(filename)
for i, sh in enumerate(wb.worksheets):
sheet_name = wb.sheetnames[i]
sheet = []
for row in sh.iter_rows():
data_row = []
for cell in row:
data_row += [cell.internal_value]
sheet += [data_row]
data[sheet_name] = sheet
elif filename.endswith('ods'):
from pyexcel_ods import get_data
wb = get_data(filename)
# wb is an OrderedDict where each key is a sheetname
for sheetname in list(wb.keys()):
sheet = wb[sheetname]
data[sheetname] = sheet
return data
ls_of_ls = import_spreadsheet_dict('csv/list-of-lists.csv') # A CSV can only have one sheet
dct = import_spreadsheet_dict('xls/list-of-lists.xls')
dct = import_spreadsheet_dict('xlsx/list-of-lists.xlsx')
dct = import_spreadsheet_dict('ods/list-of-lists.ods')
Convert a data frame to a dictionary:
import pandas as pd
# Create a data frame
df = pd.DataFrame({
'Name': ['Charles Babbage', 'Ada Lovelace', 'Alan Turing', 'Mary Somerville', 'John Conway', 'Katherine Johnson'],
'Age': [79, 36, 41, 91, 82, 101],
'DOB': ['1791-12-26', '1815-12-10', '1912-06-23', '1780-12-26', '1937-12-26', '1918-08-26'],
})
# Convert to a dictionary
dct = df.to_dict()
print(dct)
## {'Name': {0: 'Charles Babbage', 1: 'Ada Lovelace', 2: 'Alan Turing', 3: 'Mary Somerville', 4: 'John Conway', 5: 'Katherine Johnson'}, 'Age': {0: 79, 1: 36, 2: 41, 3: 91, 4: 82, 5: 101}, 'DOB': {0: '1791-12-26', 1: '1815-12-10', 2: '1912-06-23', 3: '1780-12-26', 4: '1937-12-26', 5: '1918-08-26'}}
Convert a data frame to a list-of-lists:
ls_of_ls = df.values.tolist()
# Add the column headings to the start of the list-of-lists
ls_of_ls = [df.columns.tolist()] + ls_of_ls
print(ls_of_ls)
## [['Name', 'Age', 'DOB'], ['Charles Babbage', 79, '1791-12-26'], ['Ada Lovelace', 36, '1815-12-10'], ['Alan Turing', 41, '1912-06-23'], ['Mary Somerville', 91, '1780-12-26'], ['John Conway', 82, '1937-12-26'], ['Katherine Johnson', 101, '1918-08-26']]
Convert a series to a list:
# Create a series
ser = pd.Series(['Alpha', 'Bravo', 'Charlie', 'Delta', 'Echo'])
# Convert to a list
ls = list(ser)
print(ls)
## ['Alpha', 'Bravo', 'Charlie', 'Delta', 'Echo']
Convert a list-of-lists to a dictionary (using row headers as keys):
# Create a list-of-lists
ls_of_ls = [
['Name', 'Charles Babbage', 'Ada Lovelace', 'Alan Turing'],
['Age', 79, 36, 41],
['DOB', '1791-12-26', '1815-12-10', '1912-06-23'],
]
# Convert to a dictionary
dct = {rows[0]: rows[1:] for rows in ls_of_ls}
print(dct['Name'])
## ['Charles Babbage', 'Ada Lovelace', 'Alan Turing']
Convert a list-of-lists to a dictionary for each row (using row headers as keys):
for column_idx, column_value in enumerate(ls_of_ls[0]):
# Create a dictionary for each element in the first row
globals()[str(column_value).replace(' ', '_')] = {}
# For each row, add each column's information to its dictionary
for row in ls_of_ls[1:]:
# For each row from the second onwards
for column_idx, column_value in enumerate(ls_of_ls[0]):
# For each element in the first row
try:
globals()[str(column_value).replace(' ', '_')][row[0]] = row[column_idx]
# Sometimes, not all cells have values
# These blank ones need to be passed
except ValueError:
pass
print(Charles_Babbage['Age'])
## 79
Convert a list-of-lists to a data frame (using row headers as column names):
import pandas as pd
import numpy as np
# Convert to a data frame
colnames = [row[0] for row in ls_of_ls]
data = np.transpose([row[1:] for row in ls_of_ls])
df = pd.DataFrame(data, columns=colnames)
print(df)
## Name Age DOB
## 0 Charles Babbage 79 1791-12-26
## 1 Ada Lovelace 36 1815-12-10
## 2 Alan Turing 41 1912-06-23
Convert a list-of-lists to a dictionary (using column headers as keys):
# Create a list-of-lists
ls_of_ls = [
['Name', 'Age', 'DOB'],
['Charles Babbage', 79, '1791-12-26'],
['Ada Lovelace', 36, '1815-12-10'],
['Alan Turing', 41, '1912-06-23'],
]
# Convert to a dictionary
dct = {}
for i, colname in enumerate(ls_of_ls[0]):
dct[colname] = [row[i] for row in ls_of_ls[1:]]
print(dct['Name'])
## ['Charles Babbage', 'Ada Lovelace', 'Alan Turing']
Convert a list-of-lists to a dictionary for each row (using column headers as keys):
# Convert to a dictionary
for row in ls_of_ls[1:]:
# Create a dictionary for each row from the second onwards
globals()[str(row[0]).replace(' ', '_')] = {}
# For each column, add each row's information to its dictionary
for column_idx, column_value in enumerate(ls_of_ls[0]):
# For each element in the first row
for row in ls_of_ls[1:]:
# For each row from the second onwards
globals()[str(row[0]).replace(' ', '_')][column_value] = row[column_idx]
print(Charles_Babbage['Age'])
## 79
Convert a list-of-lists to a data frame (using column headers as column names):
import pandas as pd
# Convert to a data frame
df = pd.DataFrame(ls_of_ls[1:], columns=ls_of_ls[0])
print(df)
## Name Age DOB
## 0 Charles Babbage 79 1791-12-26
## 1 Ada Lovelace 36 1815-12-10
## 2 Alan Turing 41 1912-06-23
Convert a list to an array:
import numpy as np
# Create a list
ls = [2, 3, 5, 7, 11]
# Convert to an array
ar = np.array(ls)
Convert a list of numbers to a list of strings:
ls_of_str = [str(x) for x in ls]
Convert a list-of-lists to an array-of-arrays:
# Create a list-of-lists
ls_of_ls = [[1, 2, 3], [10, 20, 30], [100, 200, 300]]
ar = np.array(ls_of_ls)
Convert a list-of-lists of numbers to a list-of-lists of strings:
ls_of_ls_str = [[str(x) for x in ls] for ls in ls_of_ls]
Convert a list of strings to a list of floating-point numbers:
ls = [float(x) for x in ls_of_str]
print(ls)
## [2.0, 3.0, 5.0, 7.0, 11.0]
Convert a list of strings to a list of integer numbers:
ls = [int(x) for x in ls_of_str]
print(ls)
## [2, 3, 5, 7, 11]
Convert a list-of-lists of strings to a list-of-lists of floating-point numbers:
ls_of_ls = [[float(j) for j in i] for i in ls_of_ls_str]
print(ls_of_ls)
## [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0], [100.0, 200.0, 300.0]]
Convert a list-of-lists of strings to a list-of-lists of integer numbers:
ls_of_ls = [[int(j) for j in i] for i in ls_of_ls_str]
print(ls_of_ls)
## [[1, 2, 3], [10, 20, 30], [100, 200, 300]]
Convert a list of strings to a dictionary:
# Create a list of strings
ls = [
'the village', 'das Dorf',
'the bread', 'das Brot',
'the cheese', 'der Käse'
]
# Convert to a dictionary
dct = {}
for i in range(len(ls)):
if i % 2 == 0:
dct[ls[i]] = ls[i + 1]
print(dct['the bread'])
## das Brot
Convert an array to a list:
import numpy as np
# Create an array
ar = np.arange(5)
# Convert to a list
ls = list(ar)
print(ls)
## [0, 1, 2, 3, 4]
Convert an array-of-arrays to a list-of-lists:
# Create an array-of-arrays
ar_of_ars = np.array([
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
[1, 2, 3, 4, 5],
])
# Convert to a list-of-lists
ls_of_ls = [list(a) for a in ar_of_ars]
print(ls_of_ls)
## [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]
Convert a tuple to a list:
# Create a tuple
tpl = (1, 2, 3, 4, 5)
# Convert to a list
ls = list(tpl)
print(ls)
## [1, 2, 3, 4, 5]
Convert a tuple-of-tuples to a list-of-lists:
# Create a tuple-of-tuples
tpl_of_tpls = (
(1, 2),
(3, 4),
(5, 6)
)
# Convert to a list-of-lists
ls_of_ls = [list(i) for i in tpl_of_tpls]
print(ls_of_ls)
## [[1, 2], [3, 4], [5, 6]]
Convert multi-line text to a list, removing empty lines:
# Create a multi-line string
st_multi = """Line 1
Line 2
Line 3
Line 4
"""
# Convert to a list
ls = [y for y in (x.strip() for x in st_multi.splitlines()) if y]
print(ls)
## ['Line 1', 'Line 2', 'Line 3', 'Line 4']
Convert multi-line text to a list, keeping empty lines:
# Convert to a list
ls = [y for y in st_multi.splitlines()]
print(ls)
## ['Line 1', 'Line 2', '', 'Line 3', 'Line 4']
Convert multi-line text to a dictionary:
# Convert to a list
ls = [y for y in (x.strip() for x in st_multi.splitlines()) if y]
# Convert to a dictionary
dct = {}
for i in range(len(ls)):
if i % 2 == 0:
dct[ls[i]] = ls[i + 1]
print(dct['Line 1'])
## Line 2
Convert a string of space-separated values to a list of numbers:
# Create a string of space-separated values
st_ssv = '30 40 50'
# Convert to a list
ls = [int(x) for x in st_ssv.split(' ')]
print(ls)
## [30, 40, 50]
Convert text formatted as a table to a data frame:
import pandas as pd
from io import StringIO
# Create a multi-line space-separated values:
st_multi_ssv = """
Year NDay Pos0 Pos1 Pos2 Pos3 Pos4
2023 4 8 11 13 14 18
2024 4 18 18 19 18 17
2025 4 17 20 25 26 27
"""
# Remove duplicate whitespace and convert to list of strings
text = [' '.join(row.split()) for row in st_multi_ssv.split('\n')]
# Remove first and last rows
text = text[1:-1]
# Add line end characters back in
text = [row + '\n' for row in text]
# Collapse to string
text = ''.join(text)
df = pd.read_csv(StringIO(text), sep=' ')
print(df)
## Year NDay Pos0 Pos1 Pos2 Pos3 Pos4
## 0 2023 4 8 11 13 14 18
## 1 2024 4 18 18 19 18 17
## 2 2025 4 17 20 25 26 27