$ git submodule sync
$ git submodule update --init --recursive
$ conda create -y -n pytorch-study python=3.9
$ conda activate pytorch-study
$ conda install -y cmake ninja
$ pip install -r requirements.txt
$ conda install -y mkl mkl-include
$ conda install -y -c pytorch magma-cuda121
$ conda install -y gcc_linux-64=11.2.0
$ conda install -y gxx_linux-64=11.2.0
$ echo $PATH | tr ":" "\n"
/home/tk/anaconda3/envs/pytorch-study/bin
/usr/bin
...
$ # there is a bug in cmake that calls `gcc` directly.
$ # here we need to make sure the conda-provided gcc is used.
$ # because the above output shows the conda PATH entry is still
$ # prioritized over /usr/bin, we can do:
$ cd $CONDA_PREFIX/bin
$ ln -s x86_64-conda-linux-gnu-gcc gcc
$ ln -s x86_64-conda-linux-gnu-g++ g++
$ cd -
$ nvidia-smi | grep CUDA
NVIDIA-SMI 535.54.03
Driver Version: 535.54.03
CUDA Version: 12.2
$ conda install -y -c nvidia/label/cuda-12.1.1 cuda
$ env | grep CUDA
CUDA_PATH=/opt/cuda
$ unset CUDA_PATH
$ echo $CMAKE_PREFIX_PATH | tr ":" "\n"
/home/tk/anaconda3/envs/pytorch-study
/home/tk/anaconda3/envs/pytorch-study/x86_64-conda-linux-gnu/sysroot/usr
$ python setup.py develop
Optionally, delete the easy-install link so that we do not mistakenly pick up the local torch package:
$ rm -f $CONDA_PREFIX/lib/python3.9/site-packages/easy-install.pth
Show the installed PyTorch information:
$ ls torch/lib
cmake libjitbackend_test.so libtensorpipe.a
libXNNPACK.a libkineto.a libtensorpipe_cuda.a
libasmjit.a libnnpack.a libtensorpipe_uv.a
libbackend_with_compiler.so libprotobuf-lite.a libtorch.so
libc10.so libprotobuf.a libtorch_cpu.so
libc10_cuda.so libprotoc.a libtorch_cuda.so
libc10d_cuda_test.so libpthreadpool.a libtorch_cuda_linalg.so
libcaffe2_nvrtc.so libpytorch_qnnpack.a libtorch_global_deps.so
libclog.a libqnnpack.a libtorch_python.so
libcpuinfo.a libshm libtorchbind_test.so
libdnnl.a libshm.so pkgconfig
libfbgemm.a libshm_windows python3.9
libfmt.a libsleef.a
$ python -c 'import torch; print(torch.cuda.is_available())'
True
$ python -c 'import torch; print(torch.version.cuda)'
12.1
$ python -c 'import torch; print(torch.__version__)'
2.2.0a0+gita5dd6de
$ git show --quiet HEAD
commit a5dd6de9e7c3e7c33887fb8ee845ba97024a0fe7 (HEAD -> main, origin/main, origin/HEAD)
Author: w32zhong <clock126@126.com>
Date: Wed Nov 15 22:02:24 2023 -0500
update README
Good to know: setup.py will generate Ninja build files as long as the ninja command is installed.
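A quick way to check that the Ninja generator will be picked up (a minimal sketch, not the logic setup.py itself uses):

```python
import shutil
import subprocess

# Minimal sketch: setup.py switches to the Ninja CMake generator when a
# working `ninja` binary is found on PATH.
ninja = shutil.which("ninja")
print("ninja found at:", ninja)
if ninja:
    version = subprocess.run([ninja, "--version"], capture_output=True, text=True)
    print("ninja version:", version.stdout.strip())
```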
At the top level of the repository:
cd torch
ln -sf ../build/lib.linux-x86_64-cpython-39/torch/_C.cpython-39-x86_64-linux-gnu.so .
cd -
cmake --build ./build --target install --config Release
For cmake:
cd hello-world
mkdir -p build
cd build && cmake -G Ninja ..
cmake --build . --target all  # or: ninja all
For pytorch:
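hello-world.py itself is not reproduced here; a hypothetical sketch consistent with the output below (a one-neuron model with a sigmoid output and an MSE loss) could be:

```python
import torch

# Hypothetical sketch of hello-world.py (the repo's actual script may differ):
# a tiny Linear + Sigmoid model and an MSE loss, matching the
# grad_fn=<SigmoidBackward0> / grad_fn=<MseLossBackward0> output below.
x = torch.randn(3)
model = torch.nn.Sequential(torch.nn.Linear(3, 1), torch.nn.Sigmoid())
y = model(x)
loss = torch.nn.functional.mse_loss(y, torch.zeros_like(y))
print(y)
print(loss)
loss.backward()
```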
$ python hello-world.py
tensor([0.8506], grad_fn=<SigmoidBackward0>)
tensor(0.0223, grad_fn=<MseLossBackward0>)
By directly invoking cmake with the --verbose option:
cd build
cmake --trace . | tee trace.log
cmake --build . --target install --config Release --verbose | tee build.log
you can see all build commands.
See build-stage1.log and build-stage2.log for my build logs.
You can also extract the building structure:
python extract_build_structure.py | tee build-struct.log
The output is saved in build-struct.log.
Another way is to use the build/compile_commands.json generated by the build system.
The output can be used to draw a module dependency graph as shown at the top.
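For example, a rough sketch of mining compile_commands.json (this is not the repo's extract_build_structure.py; the checkout path is an assumption to adjust):

```python
import json
from collections import Counter

# Rough sketch: count compiled translation units per top-level source
# directory, using the compile_commands.json that CMake writes under ./build.
ROOT = "/path/to/pytorch/"  # adjust to your checkout

def top_dir(path):
    rel = path[len(ROOT):] if path.startswith(ROOT) else path
    return rel.split("/", 1)[0]

entries = json.load(open("build/compile_commands.json"))
print(Counter(top_dir(e["file"]) for e in entries).most_common(10))
```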
To redo the whole process of Python package building (without compiling dependencies like Caffe2 etc.):
$ find . -name '_C.*.so' | xargs rm -f
$ python setup.py develop
...
copying functorch/functorch.so -> build/lib.linux-x86_64-cpython-39/functorch/_C.cpython-39-x86_64-linux-gnu.so
building 'torch._C' extension
gcc -c torch/csrc/stub.c -o build/temp.linux-x86_64-cpython-39/torch/csrc/stub.o
gcc build/temp.linux-x86_64-cpython-39/torch/csrc/stub.o -L torch/lib -ltorch_python.so -o build/lib.linux-x86_64-cpython-39/torch/_C.cpython-39-x86_64-linux-gnu.so
copying build/lib.linux-x86_64-cpython-39/torch/_C.cpython-39-x86_64-linux-gnu.so -> torch
copying build/lib.linux-x86_64-cpython-39/functorch/_C.cpython-39-x86_64-linux-gnu.so -> functorch
...
$ find . -name '_C.*.so'
./build/lib.linux-x86_64-cpython-39/functorch/_C.cpython-39-x86_64-linux-gnu.so
./build/lib.linux-x86_64-cpython-39/torch/_C.cpython-39-x86_64-linux-gnu.so
./torch/_C.cpython-39-x86_64-linux-gnu.so
./functorch/_C.cpython-39-x86_64-linux-gnu.so
Now we know _C.cpython-39-x86_64-linux-gnu.so is just a stub.o (which calls initModule) plus libtorch_python.so.
We can verify libtorch_python.so has defined the initModule:
$ nm --defined-only build/lib/libtorch_python.so | grep initModule
000000000073cdc0 T _Z20THPEngine_initModuleP7_object
0000000000741040 T _Z22THPFunction_initModuleP7_object
000000000076d720 T _Z22THPVariable_initModuleP7_object
0000000000b6bf70 T _ZN5torch3cpu10initModuleEP7_object
0000000000b8a0d0 T _ZN5torch4cuda10initModuleEP7_object
00000000006afa80 T initModule
(initModule is defined in torch/csrc/Module.cpp)
If compiled with debug information, we can easily find the source file of a symbol:
$ nm -Cl --defined-only libtorch_cpu.so | grep "at::_ops::empty_memory_format::call"
00000000020950be T at::_ops::empty_memory_format::call(c10::ArrayRef<c10::SymInt>, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, c10::optional<c10::MemoryFormat>) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/Operators_2.cpp:3231
To know which function is defined at which source code file, we need to build with debug information:
CUDA_DEVICE_DEBUG=1 DEBUG=1 REL_WITH_DEB_INFO=1 USE_DISTRIBUTED=0 USE_MKLDNN=0 BUILD_CAFFE2=0 BUILD_TEST=0 USE_NNPACK=0 USE_XNNPACK=0 USE_QNNPACK=0 USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 python setup.py develop --verbose
and now you can see debug sections in the ELF file and a symbol table with source paths:
$ readelf -S build/lib/libtorch_python.so | grep debug
[28] .debug_aranges PROGBITS 0000000000000000 01a6a2f0
[29] .debug_info PROGBITS 0000000000000000 021ca0b0
[30] .debug_abbrev PROGBITS 0000000000000000 0ccf5dad
[31] .debug_line PROGBITS 0000000000000000 0ce883d4
[32] .debug_str PROGBITS 0000000000000000 0e3c1686
[33] .debug_line_str PROGBITS 0000000000000000 111d5e0d
[34] .debug_loclists PROGBITS 0000000000000000 111e6c43
[35] .debug_rnglists PROGBITS 0000000000000000 111f549c
$ nm -C -D -l -g build/lib/libtorch_python.so | grep "initModule"
00000000008f5e83 T THPEngine_initModule(_object*) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/python_engine.cpp:475
000000000090655a T THPFunction_initModule(_object*) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/python_function.cpp:1600
0000000000943204 T THPVariable_initModule(_object*) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/python_variable.cpp:2197
00000000010d25b8 T torch::cpu::initModule(_object*) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/cpu/Module.cpp:8
00000000010f50f1 T torch::cuda::initModule(_object*) /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/cuda/Module.cpp:1533
00000000007b21c7 T initModule /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/Module.cpp:1346
And to breakpoint at a C level function in gdb:
$ gdb python
GNU gdb (GDB) 13.1
Copyright (C) 2023 Free Software Foundation, Inc.
...
(gdb) b initModule
Function "initModule" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (initModule) pending.
(gdb) run
Starting program: /home/tk/anaconda3/envs/pytorch-study/bin/python
This GDB supports auto-downloading debuginfo from the following URLs:
<https://debuginfod.archlinux.org>
Enable debuginfod for this session? (y or [n]) y
Python 3.9.18 (main, Sep 11 2023, 13:41:44)
[GCC 11.2.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
Breakpoint 1.1, initModule ()
at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/Module.cpp:1346
1346    PyObject* initModule() {
Alternatively, in one command:
gdb -ex "b initModule" -ex run --args python hello-world.pyand to add a breakpoint at file:line:
gdb -ex "b library.cpp:228" -ex run pythonTo debug a already running python:
$ python
Python 3.9.18 (main, Sep 11 2023, 13:41:44)
[GCC 11.2.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import os; print(os.getpid())
2031925
>>>
In another terminal:
$ sudo bash -c 'echo 0 > /proc/sys/kernel/yama/ptrace_scope'
$ gdb -p 2031925 python
...
(gdb) break THPVariable_tensor
Function "THPVariable_tensor" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (THPVariable_tensor) pending.
(gdb) c
Continuing.
Then back in the running Python session:
>>> a=torch.tensor([1])
this will trigger gdb to show:
Thread 1 "python" hit Breakpoint 1, torch::autograd::THPVariable_tensor (self=0x0,
args=0x7f279229e6a0, kwargs=0x0)
at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/python_torch_functions_manual.cpp:248
248 PyObject* kwargs) {
After setup, one can use the following command to trace the cmake execution.
cd build
cmake .. --trace-expand &> trace.log
Here are some of the important cmake files:
- CMakeLists.txt
  - `add_subdirectory(caffe2)`
  - `include(cmake/public/utils.cmake)`
  - `include(cmake/Dependencies.cmake)`
  - `include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/)`
- cmake/public/utils.cmake
  - `macro(caffe2_interface_library SRC DST)` (think of it as `add_dependencies(${DST} ${SRC})`)
- cmake/Codegen.cmake
  - `function(append_filelist name outputvar)` that reads source files from build_variables.bzl
- caffe2/CMakeLists.txt
  - `add_library(torch ${DUMMY_EMPTY_FILE})`
  - `add_subdirectory(../aten aten)`
  - `target_link_libraries(torch PUBLIC torch_cuda_library)`: this is the place where most of the dependencies (expanded from `torch_cuda_library`) are attached to libtorch.so
  - `caffe2_interface_library(torch_cuda torch_cuda_library)`: `torch_cuda_library` => `torch_cuda`
  - `add_subdirectory(../torch torch)`
  - `add_library(torch_cuda ${Caffe2_GPU_SRCS} ${Caffe2_GPU_CU_SRCS})`: `torch_cuda` => a lot of source files, via `list(APPEND Caffe2_GPU_SRCS ${ATen_CUDA_CPP_SRCS})` and `list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS})`
- aten/CMakeLists.txt
  - `set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE)`
- aten/src/ATen/CMakeLists.txt
  - `list(APPEND ATen_CUDA_CPP_SRCS ${cuda_cpp} ${native_cuda_cpp} ...)`
- torch/CMakeLists.txt
  - `add_dependencies(torch_python gen_torch_version)`, meaning that libtorch_python.so depends on `gen_torch_version`.
  - `add_library(torch_python SHARED ${TORCH_PYTHON_SRCS})`: similarly, libtorch_python.so depends on `${TORCH_PYTHON_SRCS}`, which can be expanded by a simple Python snippet (see below).
  - `add_dependencies(torch_python torch_python_stubs)`
  - `add_dependencies(torch_python generate-torch-sources)`
  - `target_link_libraries(torch_python PRIVATE torch_library ${TORCH_PYTHON_LINK_LIBRARIES})`, where `${TORCH_PYTHON_LINK_LIBRARIES}` depends on `ATEN_CPU_FILES_GEN_LIB`.
For lines like `append_filelist("libtorch_python_core_sources" TORCH_PYTHON_SRCS)`, we can reproduce the variable being set, i.e., TORCH_PYTHON_SRCS:
$ python
Python 3.8.16 (default, Mar 2 2023, 03:21:46)
[GCC 11.2.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> exec(open('build_variables.bzl').read())
>>> for dep in libtorch_python_core_sources:
... print(dep)
...
torch/csrc/DataLoader.cpp
torch/csrc/Device.cpp
torch/csrc/Dtype.cpp
torch/csrc/DynamicTypes.cpp
torch/csrc/Exceptions.cpp
torch/csrc/Generator.cpp
torch/csrc/Layout.cpp
torch/csrc/MemoryFormat.cpp
torch/csrc/QScheme.cpp
torch/csrc/Module.cpp
torch/csrc/PyInterpreter.cpp
torch/csrc/python_dimname.cpp
torch/csrc/Size.cpp
torch/csrc/Storage.cpp
torch/csrc/StorageMethods.cpp
torch/csrc/StorageSharing.cpp
torch/csrc/Stream.cpp
torch/csrc/TypeInfo.cpp
torch/csrc/api/src/python/init.cpp
torch/csrc/autograd/functions/init.cpp
torch/csrc/autograd/init.cpp
...
We can double check under the cmake debug flag:
cmake -DPRINT_CMAKE_DEBUG_INFO=1 ..
By utilizing ninja, we can clearly browse the dependencies in a browser. For the build target torch_python:
# assume we are still in ./build here.
ninja -t browse -p 8080 torch_python
Let's take the tensor `empty()` operator as an example here.
// torch/library.h
#define TORCH_LIBRARY(ns, m) \
static void TORCH_LIBRARY_init_##ns(torch::Library&); \
static const torch::detail::TorchLibraryInit TORCH_LIBRARY_static_init_##ns( \
torch::Library::DEF, \
&TORCH_LIBRARY_init_##ns, \
#ns, \
c10::nullopt, \
__FILE__, \
__LINE__); \
void TORCH_LIBRARY_init_##ns(torch::Library& m)
#define TORCH_LIBRARY_IMPL(ns, k, m) _TORCH_LIBRARY_IMPL(ns, k, m, C10_UID)
#define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \
static void C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library&); \
static const torch::detail::TorchLibraryInit C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_static_init_##ns##_##k##_, uid)( \
torch::Library::IMPL, \
(c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::k) \
? &C10_CONCATENATE(TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid) \
: [](torch::Library&) -> void {}), \
#ns, \
c10::make_optional(c10::DispatchKey::k), \
__FILE__, \
__LINE__); \
void C10_CONCATENATE( \
TORCH_LIBRARY_IMPL_init_##ns##_##k##_, uid)(torch::Library & m)
// build/aten/src/ATen/RegisterSchema.cpp
TORCH_LIBRARY(aten, m) {
m.def("empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", {at::Tag::core, at::Tag::pt2_compliant_tag});
}
// build/aten/src/ATen/RegisterBackendSelect.cpp
TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
m.impl("aten::empty.memory_format", TORCH_FN(empty_memory_format));
}
// ./build/aten/src/ATen/RegisterCPU.cpp
TORCH_LIBRARY_IMPL(aten, CPU, m) {
m.impl("empty.memory_format", TORCH_FN(wrapper_CPU_memory_format_empty));
}
// ./build/aten/src/ATen/RegisterFunctionalization_0.cpp
TORCH_LIBRARY_IMPL(aten, Functionalize, m) {
m.impl("detach_", TORCH_FN(functionalization::detach_));
}
// similarly in ./build/aten/src/ATen/RegisterCUDA.cpp
// similarly in ./build/aten/src/ATen/RegisterMkldnnCPU.cpp
// fallback in aten/src/ATen/ConjugateFallback.cpp, for example:
TORCH_LIBRARY_IMPL(aten, Conjugate, m) {
m.impl("empty.memory_format", torch::CppFunction::makeFallthrough());
}
After macro expansion:
// build/aten/src/ATen/RegisterSchema.cpp
static void TORCH_LIBRARY_init_aten(torch::Library&);
static const torch::detail::TorchLibraryInit
TORCH_LIBRARY_static_init_aten(
torch::Library::DEF,
&TORCH_LIBRARY_init_aten,
"aten", // namespace
c10::nullopt, // no DispatchKey for "def"
"filename.cpp",
4321
);
void TORCH_LIBRARY_init_aten(torch::Library& m) {
m.def("empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", {at::Tag::core, at::Tag::pt2_compliant_tag});
}
// taking ./build/aten/src/ATen/RegisterCPU.cpp as an example:
static void TORCH_LIBRARY_IMPL_init_aten_CPU_123(torch::Library&);
static const torch::detail::TorchLibraryInit
TORCH_LIBRARY_IMPL_static_init_aten_CPU_123(
torch::Library::IMPL,
(
c10::impl::dispatch_key_allowlist_check(c10::DispatchKey::CPU)?
&TORCH_LIBRARY_IMPL_init_aten_CPU_123 : [](torch::Library&) -> void {}
),
"aten", // namespace
c10::make_optional(c10::DispatchKey::CPU), // CPU is the DispatchKey for this "impl"
"filename.cpp",
1234
);
void TORCH_LIBRARY_IMPL_init_aten_CPU_123(torch::Library &m) {
m.impl("empty.memory_format", TORCH_FN(wrapper_CPU_memory_format_empty));
}
The torch::detail::TorchLibraryInit and Library classes:
// torch/library.h
namespace detail {
class TorchLibraryInit final {
private:
using InitFn = void(Library&);
Library lib_;
public:
// the constructor initializes the member `lib_` and calls `fn` right away.
TorchLibraryInit(
Library::Kind kind,
InitFn* fn,
const char* ns,
c10::optional<c10::DispatchKey> k,
const char* file,
uint32_t line)
: lib_(kind, ns, k, file, line) {
fn(lib_);
}
};
} // namespace detail
class TORCH_API Library final {
public:
enum Kind {
DEF,
IMPL,
FRAGMENT,
};
// ...
private:
Kind kind_;
c10::optional<std::string> ns_;
c10::optional<c10::DispatchKey> dispatch_key_;
const char* file_;
uint32_t line_;
};
// aten/src/ATen/core/library.cpp
Library::Library(Kind kind, std::string ns, c10::optional<c10::DispatchKey> k, const char* file, uint32_t line)
: kind_(kind)
, ns_(ns == "_" ? c10::nullopt : c10::make_optional(std::move(ns)))
, dispatch_key_(k.value_or(CatchAll) == CatchAll ? c10::nullopt : k)
, file_(file)
, line_(line)
{
switch (kind_) {
case DEF:
registrars_.emplace_back(
c10::Dispatcher::singleton().registerLibrary(
*ns_, debugString(file_, line_)
) // this will invoke libraries_.emplace(ns, std::move(debug));
);
[[fallthrough]];
case IMPL:
// Nothing to do, everything is OK
break;
}
}
Basically:
- DEF: register a new `lib_` in Dispatcher::singleton(), and call `TORCH_LIBRARY_init_aten(lib_)`, in which `m.def(...)` is called.
- IMPL: only call `TORCH_LIBRARY_IMPL_init_aten_Conjugate_123(lib_)`, in which `m.impl(...)` is called.
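The same DEF/IMPL pair can be exercised from Python through torch.library, which goes through the same Dispatcher code paths (a small sketch; the namespace `mylib` and the op `add_one` are made-up names):

```python
import torch
from torch.library import Library

# "mylib" and "add_one" are made-up names for illustration.
# DEF registers a schema (like TORCH_LIBRARY); IMPL attaches a kernel
# for one dispatch key (like TORCH_LIBRARY_IMPL).
mylib = Library("mylib", "DEF")
mylib.define("add_one(Tensor x) -> Tensor")
mylib.impl("add_one", lambda x: x + 1, "CPU")

print(torch.ops.mylib.add_one(torch.tensor([1, 2, 3])))  # tensor([2, 3, 4])
```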
For m.def(), it parses the schema and calls _def to set table[op] = schema:
// torch/library.h
class TORCH_API Library final {
// ...
template <typename Schema>
Library& def(
Schema&& raw_schema,
const std::vector<at::Tag>& tags = {},
_RegisterOrVerify rv = _RegisterOrVerify::REGISTER) & {
c10::FunctionSchema s = schema(std::forward<Schema>(raw_schema)); // step 1
return _def(std::move(s), nullptr, tags, rv);
}
};
inline c10::FunctionSchema schema(const char* str) {
c10::FunctionSchema s = torch::jit::parseSchema(str); // step 2 (parse)
s.setAliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA);
return s;
}
// aten/src/ATen/core/library.cpp
Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name, const std::vector<at::Tag>& tags, _RegisterOrVerify rv) & {
switch (rv) {
case _RegisterOrVerify::REGISTER:
registrars_.emplace_back(
c10::Dispatcher::singleton().registerDef(
std::move(schema), // step 3
debugString(file_, line_),
tags
)
);
break;
}
return *this;
}
// ./aten/src/ATen/core/dispatch/Dispatcher.cpp
RegistrationHandleRAII Dispatcher::registerDef(FunctionSchema schema, std::string debug, std::vector<at::Tag> tags) {
OperatorName op_name = schema.operator_name(); // op_name is just schema.name_
// (gdb) whatis schema.name_
// type = c10::OperatorName
// (gdb) whatis op_name
// type = c10::OperatorName
if (op_name.name == "aten::empty" && op_name.overload_name == "memory_format") {
::std::cout<< "register Def " << op_name.name << " " << op_name.overload_name << " with " << schema << " @ " << debug << "\n";
// and it will print:
// register Def aten::empty memory_format with aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSchema.cpp:6
}
// step 4 (actual register)
  // think of the next two lines as table[op_name] = schema
OperatorHandle op = findOrRegisterName_(op_name);
op.operatorDef_->op.registerSchema(std::move(schema), std::move(debug), std::move(tags));
}
OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
const auto found = findOp(op_name);
if (found != c10::nullopt) {
return *found;
}
operators_.emplace_back(OperatorName(op_name));
OperatorHandle handle(--operators_.end());
operatorLookupTable_.write([&] (ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) {
operatorLookupTable.emplace(op_name, handle);
});
return handle;
}
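From Python, we can confirm the schema indeed ended up in the dispatcher's table (relying on the OpOverload._schema attribute; output abbreviated):

```python
import torch

# The table[op_name] = schema entry registered above is visible from Python:
print(torch.ops.aten.empty.memory_format._schema)
# aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, ...) -> Tensor
```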
m.impl("empty.memory_format", TORCH_FN(wrapper_CPU_memory_format_empty));and m.impl() is just a wrapper on _impl, and the latter calls registerKernel(...):
// torch/library.h
class TORCH_API Library final {
template <typename Name, typename Func>
Library& impl(
Name name,
Func&& raw_f,
_RegisterOrVerify rv = _RegisterOrVerify::REGISTER) & {
CppFunction f(std::forward<Func>(raw_f));
return _impl(name, std::move(f), rv);
}
};
// aten/src/ATen/core/library.cpp
Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify rv) & {
at::OperatorName op_name = _parseNameForLib(name_str);
auto dispatch_key = f.dispatch_key_.has_value() ? f.dispatch_key_ : dispatch_key_;
switch (rv) {
case _RegisterOrVerify::REGISTER:
registrars_.emplace_back(
c10::Dispatcher::singleton().registerImpl(
std::move(op_name),
dispatch_key,
std::move(f.func_),
f.cpp_signature_,
std::move(f.schema_),
debugString(std::move(f.debug_), file_, line_)
)
);
break;
}
return *this;
}
// aten/src/ATen/core/dispatch/Dispatcher.cpp
RegistrationHandleRAII Dispatcher::registerImpl(
OperatorName op_name,
c10::optional<DispatchKey> dispatch_key,
KernelFunction kernel,
c10::optional<impl::CppSignature> cpp_signature,
std::unique_ptr<FunctionSchema> inferred_function_schema,
std::string debug
) {
auto op = findOrRegisterName_(op_name);
if (op_name.name == "aten::empty" && op_name.overload_name == "memory_format") {
::std::cout<< "register Impl " << op_name.name << " " << op_name.overload_name << " - " << *dispatch_key << " @ " << debug << "\n";
auto handle = op.operatorDef_->op.registerKernel(
*this,
dispatch_key,
std::move(kernel),
std::move(cpp_signature),
std::move(inferred_function_schema),
std::move(debug)
);
++op.operatorDef_->def_and_impl_count;
// ...
}
Overall, this is what happens after import torch:
>>> import torch
register Impl aten::empty memory_format - Conjugate @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/ConjugateFallback.cpp:21
register Impl aten::empty memory_format - ZeroTensor @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/ZeroTensorFallback.cpp:90
register Impl aten::empty memory_format - Negative @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/native/NegateFallback.cpp:23
register lib sparse @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/native/ao_sparse/library.cpp:9
register lib quantized @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/native/quantized/library.cpp:16
register lib _quantized @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/native/quantized/library.cpp:234
register lib onednn @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/native/quantized/library.cpp:255
register Impl aten::empty memory_format - BackendSelect @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterBackendSelect.cpp:807
register Impl aten::empty memory_format - CPU @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterCPU.cpp:31343
register Impl aten::empty memory_format - Meta @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterMeta.cpp:26984
register Impl aten::empty memory_format - MkldnnCPU @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterMkldnnCPU.cpp:515
register Impl aten::empty memory_format - QuantizedCPU @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterQuantizedCPU.cpp:944
register Impl aten::empty memory_format - QuantizedMeta @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterQuantizedMeta.cpp:105
register lib aten @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSchema.cpp:6
register Def aten::empty memory_format with aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSchema.cpp:6
register Impl aten::empty memory_format - SparseCPU @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSparseCPU.cpp:1387
register Impl aten::empty memory_format - SparseCsrCPU @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSparseCsrCPU.cpp:1135
register Impl aten::empty memory_format - SparseMeta @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSparseMeta.cpp:249
register lib _nnapi @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/aten/src/ATen/nnapi/nnapi_register.cpp:12
register Impl aten::empty memory_format - Autograd @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/generated/VariableType_2.cpp:19039
register Impl aten::empty memory_format - Tracer @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/autograd/generated/TraceType_2.cpp:17346
register Impl aten::empty memory_format - CUDA @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterCUDA.cpp:44396
register Impl aten::empty memory_format - QuantizedCUDA @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterQuantizedCUDA.cpp:459
register Impl aten::empty memory_format - SparseCUDA @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSparseCUDA.cpp:1573
register Impl aten::empty memory_format - SparseCsrCUDA @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/build/aten/src/ATen/RegisterSparseCsrCUDA.cpp:1276
register lib cuda @ registered at /home/tk/Desktop/nvme0n1/pytorch-that-I-successfully-built/torch/csrc/jit/cuda/cuda.h:156
torch initModule BEGIN
torch initModule END
register lib rngprims @ registered at /dev/null:228
register lib prims @ registered at /dev/null:228
register lib triton @ registered at /dev/null:1834
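Instead of patching prints into the dispatcher, the same registration state can be inspected from Python (these are private helpers used by PyTorch's own tooling, so treat them as debugging aids rather than a stable API):

```python
import torch

# Dump what the dispatcher recorded for one operator: its schema plus the
# kernels registered per dispatch key (BackendSelect, CPU, CUDA, Autograd, ...).
print(torch._C._dispatch_dump("aten::empty.memory_format"))
# And the computed dispatch table after all registrations are merged:
print(torch._C._dispatch_dump_table("aten::empty.memory_format"))
```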
For a tensor allocation like:
a=torch.tensor([1])
the following code runs:
// torch/csrc/Module.cpp
PyObject* initModule() {
static struct PyModuleDef torchmodule = {
PyModuleDef_HEAD_INIT, "torch._C", nullptr, -1, methods.data()
};
module = PyModule_Create(&torchmodule);
ASSERT_TRUE(THPVariable_initModule(module));
}
// ./torch/csrc/autograd/python_variable.cpp
bool THPVariable_initModule(PyObject* module) {
torch::autograd::initTorchFunctions(module);
}
// ./torch/csrc/autograd/python_torch_functions_manual.cpp
void initTorchFunctions(PyObject* module) {
static std::vector<PyMethodDef> torch_functions;
gatherTorchFunctions(torch_functions); // gather common functions like tensor to torch_functions
THPVariableFunctions.tp_methods = torch_functions.data();
THPVariableFunctionsModule = PyType_GenericNew(&THPVariableFunctions, Py_None, Py_None);
PyModule_AddObject(module, "_VariableFunctions", THPVariableFunctionsModule);
}
where _C._VariableFunctions is extracted to torch.* in torch/__init__.py:
# torch/__init__.py
for name in dir(_C._VariableFunctions):
    obj = getattr(_C._VariableFunctions, name)
    obj.__module__ = 'torch'
    if not name.startswith("_"):
        __all__.append(name)
The stub file (see PEP 484) for the tensor interface is at torch/_C/_VariableFunctions.pyi.
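We can check this binding from an interactive session (a quick sanity check):

```python
import torch

# torch.tensor is a builtin coming from the C extension's _VariableFunctions
# namespace; the loop above rewrote its __module__ so it presents as torch.*.
print(torch._C._VariableFunctions.tensor)  # <built-in method tensor ...>
print(torch.tensor.__module__)             # torch
```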
And in gatherTorchFunctions:
// ./torch/csrc/autograd/python_torch_functions_manual.cpp
static PyMethodDef torch_functions_manual[] = {
{"asarray",
castPyCFunctionWithKeywords(THPVariable_asarray),
METH_VARARGS | METH_KEYWORDS | METH_STATIC,
nullptr},
// ...
{"tensor",
castPyCFunctionWithKeywords(THPVariable_tensor),
METH_VARARGS | METH_KEYWORDS | METH_STATIC,
nullptr},
};
void gatherTorchFunctions(std::vector<PyMethodDef>& torch_functions) {
constexpr size_t num_functions =
sizeof(torch_functions_manual) / sizeof(torch_functions_manual[0]);
torch_functions.assign(
torch_functions_manual, torch_functions_manual + num_functions);
}
// ./torch/csrc/autograd/python_torch_functions_manual.cpp
static PyObject* THPVariable_tensor(
PyObject* self,
PyObject* args,
PyObject* kwargs) {
static PythonArgParser parser({
"tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None, bool pin_memory=False, bool requires_grad=False, DimnameList? names=None)",
});
ParsedArgs<ctor_num_args> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
return THPVariable_Wrap(torch::utils::tensor_ctor(
torch::tensors::get_default_dispatch_key(),
torch::tensors::get_default_scalar_type(),
r));
}
// ./torch/csrc/utils/tensor_new.cpp
Tensor tensor_ctor(
c10::DispatchKey dispatch_key,
at::ScalarType scalar_type,
PythonArgs& r) {
PyObject* data = r.pyobject(0);
bool type_inference = r.isNone(1);
bool pin_memory = r.toBool(3);
bool args_requires_grad = r.toBool(4);
auto new_tensor = internal_new_from_data(
typeIdWithDefault(r, 2, dispatch_key),
r.scalartypeWithDefault(1, scalar_type),
r.deviceOptional(2),
data,
/*copy_variables=*/true,
/*copy_numpy=*/true,
/*type_inference=*/type_inference,
pin_memory);
new_tensor.detach_(); // call [aten::detach_]
new_tensor.set_requires_grad(args_requires_grad);
return new_tensor;
}
Tensor internal_new_from_data(
c10::TensorOptions options,
at::ScalarType scalar_type,
c10::optional<Device> device_opt,
PyObject* data,
bool copy_variables,
bool copy_numpy,
bool type_inference,
bool pin_memory = false) {
auto device = device_opt.has_value() ? *device_opt : options.device();
auto sizes = compute_sizes(data, scalar_type);
ScalarType inferred_scalar_type =
type_inference ? infer_scalar_type(data) : scalar_type;
Tensor tensor;
{
tensor = at::empty(sizes, opts.pinned_memory(pin_memory)); // call [aten::empty.memory_format]
recursive_store(
(char*)tensor.data_ptr(),
tensor.sizes(),
tensor.strides(),
0,
inferred_scalar_type,
tensor.dtype().itemsize(),
data);
tensor = tensor.to(device, inferred_scalar_type); // call [aten::to.device]
}
return at::lift_fresh(tensor); // call [aten::lift_fresh]
  // lift_fresh is called with an argument that is guaranteed to be
  // fresh (i.e., newly allocated). This is ONLY called from a
  // torch.tensor call; The default implementation of lift is a no-op.
  // See build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp
  // and ./aten/src/ATen/native/TensorShape.cpp
}
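The comment about lift_fresh can be observed from Python with a TorchDispatchMode (a sketch; the exact set of intercepted ops may vary across versions):

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class LogOps(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print("dispatched:", func)
        return func(*args, **(kwargs or {}))

with LogOps():
    a = torch.tensor([1])
# Expect to see aten.lift_fresh.default here (possibly together with detach_),
# while the internal empty/copy run below the mode.
```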
void recursive_store(
char* data,
IntArrayRef sizes,
IntArrayRef strides,
int64_t dim,
ScalarType scalarType,
size_t elementSize,
PyObject* obj) {
int64_t ndim = static_cast<int64_t>(sizes.size());
bool is_symfloat = torch::is_symfloat(obj);
bool is_symint = torch::is_symint(obj);
if (dim == ndim) {
if (is_symfloat) {
auto new_obj = py::reinterpret_borrow<py::object>(obj);
auto val = new_obj.cast<c10::SymFloat>();
switch (elementSize) {
case 8:
*reinterpret_cast<double*>(data) = val;
break;
case 4:
*reinterpret_cast<float*>(data) = static_cast<float>(val);
break;
}
return;
}
if (is_symint) {
// ...
}
torch::utils::store_scalar(data, scalarType, obj);
return;
}
auto n = sizes[dim];
auto seq = THPObjectPtr(PySequence_Fast(obj, "not a sequence"));
PyObject** items = PySequence_Fast_ITEMS(seq.get());
for (const auto i : c10::irange(n)) {
recursive_store(
data, sizes, strides, dim + 1, scalarType, elementSize, items[i]);
data += strides[dim] * elementSize;
}
}
// ./torch/csrc/utils/python_scalars.h
inline void store_scalar(void* data, at::ScalarType scalarType, PyObject* obj) {
switch (scalarType) {
case at::kByte:
*(uint8_t*)data = unpackIntegral<uint8_t>(obj, "uint8");
break;
case at::kInt:
*(int32_t*)data = unpackIntegral<int32_t>(obj, "int32");
break;
case at::kHalf:
*(at::Half*)data =
at::convert<at::Half, double>(THPUtils_unpackDouble(obj));
break;
case at::kFloat:
*(float*)data = (float)THPUtils_unpackDouble(obj);
break;
case at::kDouble:
*(double*)data = THPUtils_unpackDouble(obj);
break;
// ...
}
}
Starting from
// ./torch/csrc/utils/tensor_new.cpp
tensor = at::empty(sizes, opts.pinned_memory(pin_memory)); // call Operator!
// ./build/aten/src/ATen/ops/empty.h
inline at::Tensor empty(at::IntArrayRef size, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
return at::_ops::empty_memory_format::call(c10::fromIntArrayRefSlow(size), optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
// ./build/aten/src/ATen/ops/empty_ops.h
struct TORCH_API empty_memory_format {
using schema = at::Tensor (c10::SymIntArrayRef, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>, c10::optional<at::MemoryFormat>);
using ptr_schema = schema*;
static constexpr const char* name = "aten::empty";
static constexpr const char* overload_name = "memory_format";
static constexpr const char* schema_str = "empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor";
static at::Tensor call(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format);
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format);
};
// ./build/aten/src/ATen/Operators_2.cpp
static C10_NOINLINE c10::TypedOperatorHandle<empty_memory_format::schema>
create_empty_memory_format_typed_handle() {
return c10::Dispatcher::singleton()
.findSchemaOrThrow(empty_memory_format::name, empty_memory_format::overload_name)
.typed<empty_memory_format::schema>();
}
at::Tensor empty_memory_format::call(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
static auto op = create_empty_memory_format_typed_handle();
return op.call(size, dtype, layout, device, pin_memory, memory_format);
}
// ./aten/src/ATen/core/operator_name.h
struct OperatorName final {
std::string name;
std::string overload_name;
OperatorName(std::string name, std::string overload_name)
: name(std::move(name)), overload_name(std::move(overload_name)) {}
};
// ./aten/src/ATen/core/dispatch/Dispatcher.cpp
OperatorHandle Dispatcher::findSchemaOrThrow(const char* name, const char* overload_name) {
auto it = findSchema({name, overload_name});
// (gdb) whatis it
// type = c10::optional<c10::OperatorHandle>
if (!it.has_value()) {
auto it2 = findOp({name, overload_name});
if (!it2.has_value()) {
TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name);
} else {
TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name,
" but we found an implementation; did you forget to def() the operator?");
}
}
// whatis it.value()
// type = c10::OperatorHandle &
return it.value();
}
c10::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& op_name) {
c10::optional_base<c10::OperatorHandle> it = findOp(op_name);
// (gdb) p (*it).schema().dump()
if (it.has_value()) {
if (it->hasSchema()) {
return it;
} else {
return c10::nullopt;
}
} else {
return it;
}
}
c10::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& op_name) {
return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> c10::optional<OperatorHandle> {
auto found = operatorLookupTable.find(op_name);
if (found == operatorLookupTable.end()) {
return c10::nullopt;
}
return found->second;
});
}
By looking at the OperatorHandle class code:
// ./aten/src/ATen/core/dispatch/Dispatcher.h
class TORCH_API OperatorHandle {
template<class FuncType>
TypedOperatorHandle<FuncType> typed() const {
return TypedOperatorHandle<FuncType>(operatorIterator_);
}
};
template<class Return, class... Args>
class TypedOperatorHandle<Return (Args...)> final : public OperatorHandle {
C10_ALWAYS_INLINE Return call(Args... args) const {
return c10::Dispatcher::singleton().call<Return, Args...>(*this, std::forward<Args>(args)...);
}
};
we know the result of the aforementioned chain is
c10::Dispatcher::singleton()
.findSchemaOrThrow(...) /* OperatorHandle */
.typed<empty_memory_format::schema>() /* TypedOperatorHandle */
.call() /* calling Dispatcher::singleton().call() */
and this call() invokes:
// ./aten/src/ATen/core/dispatch/Dispatcher.h
template<class Return, class... Args>
C10_ALWAYS_INLINE_UNLESS_MOBILE Return
Dispatcher::call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const {
// (gdb) p op.operatorDef_->op.listAllDispatchKeys().c_str()
// "[CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, QuantizedMeta, MkldnnCPU"...
c10::DispatchKeySet dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor()
.template getDispatchKeySetUnboxed<Args...>(args...);
std::cerr << "[call] op=[" << op.operator_name() << "], key=[" << dispatchKeySet.highestPriorityTypeId() << "]" << std::endl;
const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet);
return kernel.template call<Return, Args...>(op, dispatchKeySet, std::forward<Args>(args)...);
}
// ./aten/src/ATen/core/boxing/KernelFunction_impl.h
template<class Return, class... Args>
C10_ALWAYS_INLINE Return KernelFunction::call(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Args... args) const {
if (guts::disjunction<has_symint<Args>...>::value) {
if (sym_unboxed_kernel_func_ != nullptr) {
auto *functor = boxed_kernel_func_.getFunctor();
::std::cout << "callUnboxedKernelFunction with sym" << "\n";
return callUnboxedKernelFunction<Return, Args...>(
sym_unboxed_kernel_func_, functor, dispatchKeySet, std::forward<Args>(args)...);
}
if (unboxed_kernel_func_ != nullptr) {
auto *functor = boxed_kernel_func_.getFunctor();
::std::cout << "callUnboxedKernelFunction with sym and unboxed" << "\n";
return callUnboxedKernelFunction<Return, typename remove_symint<Args>::type...>(
unboxed_kernel_func_, functor, dispatchKeySet, unpackSymInt<Args>(args)...);
}
} else {
if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
auto *functor = boxed_kernel_func_.getFunctor();
::std::cout << "callUnboxedKernelFunction with unboxed" << "\n";
return callUnboxedKernelFunction<Return, Args...>(
unboxed_kernel_func_, functor, dispatchKeySet, std::forward<Args>(args)...);
}
}
::std::cout << "call impl::BoxedKernelWrapper" << "\n";
return impl::BoxedKernelWrapper<Return(Args...)>::call(
boxed_kernel_func_,
opHandle,
dispatchKeySet,
std::forward<Args>(args)...
);
}
The implementation is located by finding the empty_memory_format or wrapper_CPU_memory_format_empty function (or by referring to native_functions.yaml):
// build/aten/src/ATen/RegisterBackendSelect.cpp
at::Tensor empty_memory_format(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
DispatchKeySet _dk = c10::DispatchKeySet(c10::computeDispatchKey(dtype, layout, device));
return at::_ops::empty_memory_format::redispatch(
_dk, size, dtype, layout, device, pin_memory, memory_format);
}
// build/aten/src/ATen/Operators_2.cpp
at::Tensor empty_memory_format::redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
static auto op = create_empty_memory_format_typed_handle();
::std::cerr << "[redispatch] op=[" << op.operator_name() << "], key=[" << dispatchKeySet.highestPriorityTypeId() << "]" << ::std::endl;
return op.redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory, memory_format);
}
// ./aten/src/ATen/core/dispatch/Dispatcher.h
template<class Return, class... Args>
inline Return Dispatcher::redispatch(const TypedOperatorHandle<Return (Args...)>& op, DispatchKeySet currentDispatchKeySet, Args... args) const {
const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet);
// call KernelFunction::call again...
return kernel.template call<Return, Args...>(op, currentDispatchKeySet, std::forward<Args>(args)...);
}
// ./build/aten/src/ATen/RegisterCPU.cpp
at::Tensor wrapper_CPU_memory_format_empty(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
return at::native::empty_cpu(C10_AS_INTARRAYREF_SLOW(size), dtype, layout, device, pin_memory, memory_format);
}
// ./aten/src/ATen/native/TensorFactories.cpp
Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
Tensor result = at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
return result;
}
// ./aten/src/ATen/EmptyTensor.cpp
TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
c10::optional<c10::MemoryFormat> memory_format_opt) {
c10::Allocator* allocator = GetCPUAllocatorMaybePinned(pin_memory);
constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
return _empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt);
}
template <typename T>
TensorBase _empty_generic(
ArrayRef<T> size,
c10::Allocator* allocator,
c10::DispatchKeySet ks,
ScalarType scalar_type,
c10::optional<c10::MemoryFormat> memory_format_opt) {
caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type); /* size_t */
unsigned long size_bytes = computeStorageNbytesContiguous(size, dtype.itemsize());
auto storage_impl = c10::make_intrusive<StorageImpl>(
size_bytes,
allocator,
/*resizeable=*/true);
at::TensorBase tensor = detail::make_tensor_base<TensorImpl>(
std::move(storage_impl), ks, dtype);
return tensor;
}
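The per-backend kernel names used above (empty_cpu, ...) can also be read straight out of native_functions.yaml (a rough sketch using PyYAML, run from the repo root):

```python
import yaml  # pip install pyyaml

# Rough sketch: look up the dispatch table of empty.memory_format in
# aten/src/ATen/native/native_functions.yaml.
with open("aten/src/ATen/native/native_functions.yaml") as f:
    for entry in yaml.safe_load(f):
        func = entry.get("func", "")
        if func.startswith("empty.memory_format("):
            print(func)
            print(entry.get("dispatch"))  # e.g. {'CPU': 'empty_cpu', ...}
```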
The overall route after calling a=torch.tensor():
[call] op=[aten::empty.memory_format], key=[BackendSelect]
callUnboxedKernelFunction with sym
[redispatch] op=[aten::empty.memory_format], key=[CPU]
callUnboxedKernelFunction with sym
[call] op=[aten::to.device], key=[CPU]
callUnboxedKernelFunction with unboxed
[call] op=[aten::lift_fresh], key=[CPU]
callUnboxedKernelFunction with unboxed
[call] op=[aten::detach_], key=[AutogradCPU]
callUnboxedKernelFunction with unboxed
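The dispatch key set that drove this route can also be inspected on the resulting tensor (using the private helper torch._C._dispatch_keys):

```python
import torch

a = torch.tensor([1])
# Shows the keys carried by the tensor, e.g. CPU plus the autograd-related
# keys added in TensorImpl's constructor (shown further below).
print(torch._C._dispatch_keys(a))
```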
// ./c10/core/StorageImpl.h
struct C10_API StorageImpl : public c10::intrusive_ptr_target {
public:
StorageImpl( /* wrapper #1 */
const SymInt& size_bytes, /* size_bytes has been casted to SymInt type */
at::Allocator* allocator,
bool resizable)
: StorageImpl( /* allocate and call wrapper #2 */
size_bytes,
size_bytes.is_heap_allocated() /* 0 */
? allocator->allocate(0)
: allocator->allocate(size_bytes.as_int_unchecked()),
allocator,
resizable) {}
StorageImpl( /* wrapper #2 */
SymInt size_bytes,
at::DataPtr data_ptr,
at::Allocator* allocator,
bool resizable)
: data_ptr_(std::move(data_ptr)),
size_bytes_(std::move(size_bytes)),
size_bytes_is_heap_allocated_(size_bytes_.is_heap_allocated()),
resizable_(resizable),
received_cuda_(false),
allocator_(allocator) {}
private:
DataPtr data_ptr_;
SymInt size_bytes_;
bool size_bytes_is_heap_allocated_;
bool resizable_;
bool received_cuda_;
Allocator* allocator_;
};
// ./aten/src/ATen/core/TensorBase.h
template <typename T, typename... Args>
TensorBase make_tensor_base(Args&&... args) {
return TensorBase(c10::make_intrusive<T>(std::forward<Args>(args)...));
}
class TORCH_API TensorBase {
explicit TensorBase(
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
: impl_(std::move(tensor_impl)) {
}
};
// ./c10/core/TensorImpl.cpp
TensorImpl::TensorImpl(
Storage&& storage,
DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
c10::optional<c10::Device> device_opt)
: storage_(std::move(storage)),
numel_(0), /* number of elements */
data_type_(data_type),
device_opt_(device_opt) {
init_bitfields();
if (!is_inference()) {
auto k = key_set.highestBackendKey();
key_set_ = key_set | getAutogradRelatedKeySetFromBackend(k);
}
}
Remember that recursive_store stores the data at (char*)tensor.data_ptr().
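From Python, the same pointer and its backing storage can be poked at directly (a quick check; the 64-byte alignment comes from alloc_cpu, shown at the end of this section):

```python
import torch

a = torch.tensor([1])
# data_ptr() is the raw address recursive_store wrote into; the storage holds
# the allocator's buffer (one int64 element -> 8 bytes).
print(hex(a.data_ptr()))
print(a.untyped_storage().nbytes())   # 8
print(a.data_ptr() % 64)              # typically 0: alloc_cpu aligns to 64 bytes
```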
// ./aten/src/ATen/core/TensorBase.h
class TORCH_API TensorBase {
void* data_ptr() const {
return mutable_data_ptr();
}
void* mutable_data_ptr() const {
return this->unsafeGetTensorImpl()->mutable_data();
}
TensorImpl * unsafeGetTensorImpl() const {
return impl_.get(); /* get the intrusive_ptr, here is TensorImpl */
}
};
// ./c10/core/TensorImpl.h
struct C10_API TensorImpl : public c10::intrusive_ptr_target {
inline void* mutable_data() {
return data_impl<void>(
[this] {
return static_cast<char*>(storage_.mutable_data());
}
);
}
template <typename Void, typename Func>
Void* data_impl(const Func& get_data) const {
char* data = get_data();
return data + data_type_.itemsize() * storage_offset_;
}
protected:
Storage storage_;
};
// ./c10/core/StorageImpl.h
struct C10_API StorageImpl : public c10::intrusive_ptr_target {
void* mutable_data() {
return data_ptr_.mutable_get();
}
};
// ./c10/core/Allocator.h
class C10_API DataPtr {
private:
c10::detail::UniqueVoidPtr ptr_; /* an owning smart pointer like unique_ptr */
Device device_;
public:
void* mutable_get() {
return ptr_.get(); /* return the actual C void pointer */
}
};
Now, let's look at the allocator:
// ./c10/core/CPUAllocator.cpp
struct C10_API DefaultCPUAllocator final : at::Allocator {
at::DataPtr allocate(size_t nbytes) const override {
void* data = nullptr;
try {
data = c10::alloc_cpu(nbytes);
} catch (c10::Error& e) {
profiledCPUMemoryReporter().OutOfMemory(nbytes);
throw e;
}
// "ptr, ctx, ctx_deleter, device" for the smart pointer to work.
return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
}
};
// ./c10/core/impl/alloc_cpu.cpp
void* alloc_cpu(size_t nbytes) {
void* data;
constexpr size_t gAlignment = 64;
// posix_memalign: work with memory aligned on larger block sizes than malloc.
int err = posix_memalign(&data, gAlignment, nbytes);
// caffe2 interface: move data to a thread's Non-Uniform Memory Access (NUMA) node
NUMAMove(data, nbytes, GetCurrentNUMANode());
if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
memset(data, 0, nbytes);
} else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
memset_junk(data, nbytes);
}
return data;
}
See backprop.md
