我有一个 C 库,它从文件中读取二进制数据,对其进行转换,并把全部结果存放在一个很大的 char * 缓冲区中,以便将数据返回给调用方。这在 C 中运行得很好,但在通过 Python/Cython 调用时,我在内存分配上遇到了问题。
该库函数的原型是:
int readWrapper(struct options opt, char *lineOut);
我的pyx文件:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# C declarations taken from the reader library's header.
cdef extern from "reader.h":
    # Settings passed by value to the C reader.
    struct options:
        int debug
        char *filename

    # External variable of type `options` declared by the header.
    options opt

    # Writes the converted file content into the caller-supplied buffer
    # `lineOut` and returns an int status code.
    int readWrapper(options opt, char *lineOut)
def pyreader(file, date, debug=0):
    """Run the C reader on ``file`` and return its output as a DataFrame.

    Parameters:
        file: path of the input file, given as ``str`` or ``bytes``.
        date: accepted for interface compatibility; not used here.
        debug: value copied into ``options.debug``.

    Returns:
        The ``pandas.DataFrame`` parsed from the whitespace-separated text
        that ``readWrapper`` wrote into the output buffer.

    Raises:
        MemoryError: if one of the C buffers cannot be allocated.
        RuntimeError: if ``readWrapper`` returns a non-zero status.
    """
    from io import StringIO
    import pandas as pd

    cdef options options
    cdef char *line_output = NULL
    # Fixed capacity of the output buffer. readWrapper() receives only the
    # pointer, so output longer than this still overruns the buffer.
    cdef Py_ssize_t outSize = 50000

    # The C side needs a NUL-terminated byte string, not a Python str.
    file_bytes = file if isinstance(file, bytes) else file.encode('utf-8')

    options.debug = debug
    # +1 leaves room for the terminating NUL written by strcpy().
    options.filename = <char *>malloc(len(file_bytes) + 1)
    if options.filename == NULL:
        raise MemoryError("could not allocate the filename buffer")

    try:
        strcpy(options.filename, file_bytes)

        # One extra byte so that index `outSize` is a valid slot for the
        # terminator (the original wrote it one past the allocation).
        line_output = <char *>malloc(outSize + 1)
        if line_output == NULL:
            raise MemoryError("could not allocate the output buffer")
        # NOTE(review): filler value 1 is kept from the original code; its
        # purpose is not visible here.
        memset(line_output, 1, outSize)
        line_output[outSize] = 0

        # Call reader
        return_val = readWrapper(options, line_output)
        if return_val != 0:
            raise RuntimeError(
                "readWrapper failed with exit code {}".format(return_val))

        # Create dataframe
        data = StringIO(line_output.decode('UTF-8', 'strict'))
        return pd.read_csv(data, delim_whitespace=True, header=None)
    finally:
        # Release the C buffers on every path; free(NULL) is a no-op.
        free(line_output)
        free(options.filename)
只要输出数据的长度不超过 outSize,这段代码就能正常工作。但有些文件产生的输出更大,那么该如何动态地分配这块内存呢?
根据 DavidW 的建议所做的修改
读取器包装函数(readWrapper)大致如下:
/*
 * Read the file named by opt.filename through reader() and hand the
 * accumulated text back to the caller.
 *
 * Parameters:
 *   opt     - reader options; opt.filename is the path that gets opened.
 *   lineOut - receives a malloc'ed, NUL-terminated buffer holding every
 *             chunk produced by reader(), concatenated in order. The caller
 *             owns the buffer and must free() it. It is set to NULL on
 *             every failure path.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the file cannot be opened or
 * memory cannot be (re)allocated.
 */
int readWrapper(struct options opt, char **lineOut)
{
    size_t ARRAY_SIZE = 5000;          // current capacity of outLine
    const size_t memIncrease = 10000;  // growth step when outLine is full
    size_t numWritten = 0;             // characters stored so far (no NUL)
    char line[255] = "";               // one chunk produced by reader()
    char *outLine = NULL;
    FILE *fp = NULL;

    if (lineOut == NULL)
        return (EXIT_FAILURE);
    // Make sure the caller never sees a stale pointer on failure.
    *lineOut = NULL;

    // Open file for reading
    fp = fopen(opt.filename, "r");

    // Check for valid fp
    if (fp == NULL)
    {
        printf("file pointer is null, aborting\n");
        return (EXIT_FAILURE);
    }

    // Allocate memory
    outLine = malloc(ARRAY_SIZE * sizeof (char));
    if (outLine == NULL)
    {
        fprintf(stderr, "Memory allocation failed!");
        fclose(fp);
        return (EXIT_FAILURE);
    }
    // Start with an empty string so an empty file yields "".
    outLine[0] = '\0';

    // feof() must be called on the stream; comparing fp with the function
    // itself never detects end-of-file.
    // NOTE(review): feof() only turns true after a read has hit EOF, so
    // this assumes reader() leaves `line` as a valid string at that point.
    while (!feof(fp))
    {
        // Read part of file
        reader(fp, opt, line);
        size_t num2Write = strlen(line);

        if (ARRAY_SIZE < (numWritten + num2Write + 1))
        {   // Won't fit so enlarge outLine
            ARRAY_SIZE += memIncrease;
            // Keep the old pointer until realloc succeeds, otherwise the
            // existing buffer would leak on failure.
            char *grown = realloc(outLine, (sizeof *outLine * ARRAY_SIZE));
            if (grown == NULL)
            {
                fprintf(stderr, "Memory re-allocation failed!");
                free(outLine);
                fclose(fp);
                return (EXIT_FAILURE);
            }
            outLine = grown;
        }

        // Append the chunk including its terminating NUL.
        memcpy(outLine + numWritten, line, num2Write + 1);
        numWritten += num2Write;
    } // data block loop

    *lineOut = outLine;
    fclose(fp);
    return (EXIT_SUCCESS);
}
新的pyx文件:
from libc.string cimport strcpy, memset
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
# C declarations taken from the reader library's header.
cdef extern from "reader.h":
    # Settings passed by value to the C reader.
    struct options:
        int debug
        char *filename

    # External variable of type `options` declared by the header.
    options opt

    # The wrapper allocates the output buffer itself and returns it through
    # `lineOut`, so the parameter is a pointer to the caller's `char *`
    # (the call site passes `&line_output`). Returns an int status code.
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
    """Run the C reader on ``file`` and return its output as a DataFrame.

    The output buffer is allocated by ``readWrapper`` and released here.

    Parameters:
        file: path of the input file, given as ``str`` or ``bytes``.
        date: accepted for interface compatibility; not used here.
        debug: value copied into ``options.debug``.

    Returns:
        The ``pandas.DataFrame`` parsed from the whitespace-separated text
        returned by ``readWrapper``.

    Raises:
        MemoryError: if the filename buffer cannot be allocated.
        RuntimeError: if ``readWrapper`` reports failure or returns no
            buffer.
    """
    from io import StringIO
    import pandas as pd

    cdef options options
    cdef char *line_output = NULL

    # The C side needs a NUL-terminated byte string, not a Python str.
    file_bytes = file if isinstance(file, bytes) else file.encode('utf-8')

    options.debug = debug
    # +1 leaves room for the terminating NUL written by strcpy().
    options.filename = <char *>malloc(len(file_bytes) + 1)
    if options.filename == NULL:
        raise MemoryError("could not allocate the filename buffer")

    try:
        strcpy(options.filename, file_bytes)

        # Call reader
        return_val = readWrapper(options, &line_output)
        # On failure the wrapper leaves the pointer NULL; decoding it
        # would dereference a NULL pointer.
        if return_val != 0 or line_output == NULL:
            raise RuntimeError(
                "readWrapper failed with exit code {}".format(return_val))

        # Create dataframe
        data = StringIO(line_output.decode('UTF-8', 'strict'))
        return pd.read_csv(data, delim_whitespace=True, header=None)
    finally:
        # Free memory on every path; free(NULL) is a no-op.
        free(line_output)
        free(options.filename)
现在这样可以正常工作了,但只要在包装器(C)或 Python(pyx)部分中使用任何
printf
或
fprintf(stdout,...)
语句,就会出现如下错误:
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe
该错误只在通过
python3 test.py | head
运行时出现;不接 head 管道时不会显示该错误。
最后,关于文件名及其内存分配的建议对我也不起作用。使用
options.filename = file
可以通过编译,但在运行时会报错:
TypeError: expected bytes, str found
有趣的是,BrokenPipeError 只有在像下面这样运行调用包装器的 Python 代码时才会出现:
python3 test.py | head
如果不使用管道和 head,就不会出现 BrokenPipeError。因此这不是什么大问题,但我想了解它的成因。
在搜索了一些关于 BrokenPipeError 的资料之后补充
这个 BrokenPipeError 问题在使用 head 时出现,而使用 tail 时不会出现。关于这个“错误”的解释可以在这里找到: https://stackoverflow.com/a/30091579/2885280
最终方案的 pyx 文件:
这是最终的 reader.pyx 文件,它使用前面提到的 readWrapper.c。内存分配由 C 代码负责,(最后的)释放由 pyx 代码负责。
from libc.stdlib cimport free
# C declarations taken from the reader library's header.
cdef extern from "reader.h":
    # Settings passed by value to the C reader.
    struct options:
        int debug
        char *filename
        char *DAY

    # External variable of type `options` declared by the header.
    options opt

    # Returns the output buffer through `lineOut` (a pointer to the caller's
    # `char *`) and an int status code as the return value.
    int readWrapper(options opt, char **lineOut)
def pyreader(file, date, debug=0):
import logging
import sys
import errno
import pandas as pd
# Init return valus
a = pd.DataFrame()
cdef options options
cdef char *line_output = NULL
# logging
logging.basicConfig(stream=sys.stdout,
format='%(asctime)s:%(process)d:%(filename)s:%(lineno)s:pyreader: %(message)s',
datefmt='%Y%m%d_%H.%M.%S',
level=logging.DEBUG if debug > 0 else logging.INFO)
# Check inputs
if file is None:
raise Exception("No valid filename provided")
if date is None:
raise Exception("No valid date provided")
# Get the filename
file_enc = file.encode("ascii")
options.filename = file_enc
# Get date
day_enc = date.encode('ascii')
options.DAY = day_enc
# Call reader
return_val = readWrapper(options, &line_output)
if (return_val > 0):
logging.error("pyreadASTERIX2 failed with exitcode {}".format(return_val))
return a
except Exception:
logging.exception("Error occurred")
free(line_output)
return a
from io import StringIO
data = StringIO(line_output.decode('UTF-8', 'strict'))
logging.debug("return_val: {} and size: {}".format(return_val, len(line_output.decode('UTF-8', 'strict'))))
a = pd.read_csv(data, delim_whitespace=True, header=None, dtype={'id':str})
if a.empty:
logging.error("failed to load {} not enough data to construct DataFrame".format(file))
return a
logging.debug("converted data into pd")
except Exception as e:
logging.exception("Exception occured while loading: {} into DataFrame".format(file))
return a
finally:
free(line_output)
logging.debug("Size of df: {}".format(len(a)))
# Success, return DataFrame
return a
except Exception: