// open the hdf5 file and the dataset file_id = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT); dataset_id = H5Dopen2(file_id, dataset_name, H5P_DEFAULT);
// get file dataspace dataspace_id = H5Dget_space(dataset_id);
// select subset from dataspace status = H5Sselect_hyperslab(dataspace_id, H5S_SELECT_SET, offset, stride, count, block); if (status < 0) { H5Eprint2(H5E_DEFAULT, NULL); return -1; }
status = H5Sselect_hyperslab(memspace_id, H5S_SELECT_SET, offset_mem, stride_mem, count_mem, block_mem); if (status < 0) { H5Eprint2(H5E_DEFAULT, NULL); return -1; }
// read data status = H5Dread(dataset_id, H5T_NATIVE_DOUBLE, memspace_id, dataspace_id, H5P_DEFAULT, buffer); if (status < 0) { H5Eprint2(H5E_DEFAULT, NULL); return -1; } }
// close the resources H5Sclose(memspace_id); H5Sclose(dataspace_id); H5Dclose(dataset_id); H5Fclose(file_id);
return 0; }
基本实现思路是每次只读取HDF5文件中矩阵的一行,然后写入到内存空间中。每次读取都要用函数 H5Sselect_hyperslab 设定读取区域,同样,每次写入也要使用函数 H5Sselect_hyperslab 设定写入区域。这个实现思路参考了HDF5官方教程 Reading From or Writing To a Subset of a Dataset,不过教程中仅仅设定了文件的读取区域,没有设定写入区域。
Python
这部分使用Cython来实现Python中调用前面所写的C语言代码,这部分代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13
def read_direct_fortran(str file_name, str dataset_name, double[::1, :] data, int offset_x, int offset_y): cdef int num_rows cdef int num_cols cdef bytes file_name_bytes = bytes(file_name, encoding='utf-8') cdef bytes dataset_name_bytes = bytes(dataset_name, encoding='utf-8') cdef int ret;
num_rows = data.shape[0] num_cols = data.shape[1]
ret = read_direct_f(file_name_bytes, dataset_name_bytes, &data[0, 0], num_rows, num_cols, offset_x, offset_y)
Given a 32-bit signed integer, reverse digits of an integer.
Example 1:
1 2
Input: 123 Output: 321
Example 2:
1 2
Input: -123 Output: -321
Example 3:
1 2
Input: 120 Output: 21
Note:
Assume we are dealing with an environment which could only store integers within the 32-bit signed integer range: \([-2^{31}, 2^{31} - 1]\). For the purpose of this problem, assume that your function returns 0 when the reversed integer overflows.
class Solution { public: int reverse(int x) { int mod; long long num; bool postive;
num = 0; postive = x >=0 ? true : false; while (x) { mod = x % 10; x = x / 10; num = num * 10 + mod; if ((postive && num > 0x7FFFFFFF) || (!postive && num < (signed int)0x80000000)) return 0; }
这里举一个例子来感受下这个近似公式是如何缩减大量计算量。一个经典的概率问题-Birthday Problem
Birthday Problem
Calculate the probability p that at least two people in a group of k people will have the same birthday, that is, will have been born on the same day of the same month but not necessarily in the same year.
>>> obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd']) >>> obj a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64 >>> obj['b'] 1.0 >>> obj[1] 1.0 >>> obj[:2] a 0.0 b 1.0 dtype: float64 >>> obj[obj < 2] a 0.0 b 1.0 dtype: float64
# 如果使用 label 索引,则索引区间为闭区间 >>> obj['b':'c'] b 1.0 c 2.0 dtype: float64
5. DataFrame索引
先贴张DataFrame索引方法的表格,摘录自《Python for Data Analysis》。
Type
Notes
df[val]
Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion)
df.loc[val]
Selects single row or subset of rows from the DataFrame by label
df.loc[:, val]
Selects single column or subset of columns by label
df.loc[val1, val2]
Select both rows and columns by label
df.iloc[where]
Selects single row or subset of rows from the DataFrame by integer position
df.iloc[:, where]
Selects single column or subset of columns by integer position
df.iloc[where_i, where_j]
Select both rows and columns by integer position
df.at[label_i, label_j]
Select a single scalar value by row and column label
df.iat[i, j]
Select a single scalar value by row and column position (integers)
先创建一个数据
1 2 3 4 5 6 7 8 9
>>> data = pd.DataFrame(np.arange(16).reshape((4, 4)), ... index=['Ohio', 'Colorado', 'Utah', 'New York'], ... columns=['one', 'two', 'three', 'four']) >>> data one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 利用单个label选择单列 >>> data['two'] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64
# 利用多个label选择多列,可以改变列顺序 >>> data[['three', 'one']] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12
# 利用boolean数组选择多行 >>> bools = np.array([False, True, False, True]) >>> bools array([False, True, False, True]) >>> data[bools] one two three four Colorado 4 5 6 7 New York 12 13 14 15
# 利用切片(slice)选择多行,类似Numpy的语法 >>> data[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 利用boolean DataFrame选择数据 >>> data < 5 one two three four Ohio True True True True Colorado True False False False Utah False False False False New York False False False False >>> data[data < 5] = 0 >>> data one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 使用 df.loc[val] 选择行 >>> data.loc['Utah'] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.loc[['Utah', 'Ohio']] one two three four Utah 8 9 10 11 Ohio 0 1 2 3 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:'Utah'] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11
# 使用 df.loc[:, val] 选择列 >>> data.loc[:, 'one'] Ohio 0 Colorado 4 Utah 8 New York 12 Name: one, dtype: int64 >>> data.loc[:, ['one', 'two']] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:, :'two'] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.loc[val1, val2] 选择多行多列 >>> data.loc[['Colorado', 'Ohio'], ['two', 'three']] two three Colorado 5 6 Ohio 1 2
# 使用 df.iloc[where] 选择行 >>> data.iloc[2] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.iloc[[2,1]] one two three four Utah 8 9 10 11 Colorado 4 5 6 7 >>> data.iloc[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 使用 df.iloc[:, where] 选择列 >>> data.iloc[:, 1] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64 >>> data.iloc[:, [2, 0]] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12 >>> data.iloc[:, :2] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.iloc[where_i, where_j] 选择多行多列 >>> data.iloc[2, :2] one 8 two 9 Name: Utah, dtype: int64 >>> data.iloc[:2, :2] one two Ohio 0 1 Colorado 4 5
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna() name toy born 1 Batman Batmobile 1940-04-25
删除有缺失值的列
设定dropna函数的参数axis为1可以删除有缺失值的列
1 2 3 4 5 6 7 8 9 10 11 12 13
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna(axis=1) name 0 Alfred 1 Batman 2 Catwoman
8. 重复值
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], ... 'k2': [1, 1, 2, 3, 3, 4, 4]}) >>> data k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4 6 two 4
8.1. 删除重复值
可以使用函数drop_duplicates()删除重复值
1 2 3 4 5 6 7 8
>>> data.drop_duplicates() k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4
函数drop_duplicates()默认考虑的是全部列,也可以设定某些列来判断是否重复
1 2 3 4
>>> data.drop_duplicates(['k1']) k1 k2 0 one 1 1 two 1
>>> df.sort_values(by='a') a b 0 0 4 2 0 -3 1 1 7 3 1 2 >>> df.sort_values(by=['a', 'b']) a b 2 0 -3 0 0 4 3 1 2 1 1 7
10. GroupBy
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> df = pd.DataFrame({'key1':['a','a','b','b','a'], ... 'key2':['one','two','one','two','one'], ... 'data1':np.random.randn(5), ... 'data2':np.random.randn(5)}) >>> df data1 data2 key1 key2 0 2.462027 0.054159 a one 1 0.283423 -0.658160 a two 2 -0.969307 -0.407126 b one 3 -0.636756 1.925338 b two 4 -0.408266 1.833710 a one
>>> df1 = pd.DataFrame({'A': ['a', 'b', 'c']}) >>> df1 A 0 a 1 b 2 c >>> df2 = pd.DataFrame({'B': [0, 1, 2]}) >>> df2 B 0 0 1 1 2 2 >>> pd.merge(df1.assign(foo=0), df2.assign(foo=0), on=['foo']).drop(columns=['foo']) A B 0 a 0 1 a 1 2 a 2 3 b 0 4 b 1 5 b 2 6 c 0 7 c 1 8 c 2
import os import requests from retrying import retry from io import BytesIO from PIL import Image import progressbar import concurrent.futures as concurrent
with concurrent.ThreadPoolExecutor(max_workers=num_workers) as executor: image_urls = [] image_paths = [] for image_id in range(num_images): url = base_url + '/' + str(image_id + 1) + suffix file_path = os.path.join(image_dir, str(image_id + 1) + suffix) image_urls.append(url) image_paths.append(file_path)
tasks = { executor.submit(download, url, file_path): (url, file_path) for url, file_path in zip(image_urls, image_paths) }
i = 0 total = len(image_urls) pbar = progressbar.ProgressBar(max_value=total).start() for task in concurrent.as_completed(tasks): url, file_path = tasks[task] try: task.result() i = i + 1 pbar.update(i) except Exception as exc: print('{} generated an exception: {}'.format(url, exc)) pbar.finish()
4. 遍历文件夹中所有文件
首先目录结构如下:
1 2 3 4 5 6 7
$ tree test test ├── 1.txt ├── 2.txt └── test2 ├── 3.txt └── 4.txt
使用os.walk()遍历test目录,代码如下:
1 2 3 4 5 6
# Walk the tree under root_dir from the top down and print the full path of
# every entry: for each visited directory, its files first, then its
# immediate subdirectories.
root_dir = '/tmp/test'
for root, dirs, files in os.walk(root_dir, topdown=True):
    # List concatenation keeps the files-before-directories print order.
    for name in files + dirs:
        print(os.path.join(root, name))
deftimethis(func): """ Decorator that reports the execution time """ @wraps(func) defwrapper(*args, **kwargs): start = time.time() result = func(*args, **kwargs) end = time.time() print(func.__name__, end-start)
if __name__ == "__main__":
    # Demo: report whether a list and a plain integer are iterable,
    # using the isiterable() helper.
    a = [1, 2]
    b = 3

    a_verdict = 'iterable' if isiterable(a) else 'not iterable'
    print(f"'a' is {a_verdict}")

    b_verdict = 'iterable' if isiterable(b) else 'not iterable'
    print(f"'b' is {b_verdict}")