---
layout: post
title: 第168期
---
公众号
点击「查看原文」跳转到 GitHub 上对应文件,链接就可以点击了
qq群 点击进入
欢迎投稿,推荐或自荐文章/软件/资源等,评论区留言
本期文章由 HNY Amnesia 赞助 在此表示感谢
标准委员会动态/ide/编译器信息放在这里
看个乐呵,其实还是要面向需求,这个概念起码16/17年就有了,隔壁rust有用,folly::Poly没听说有谁用
省流,约束自己的,编译器足够聪明能分析出优化点,用const指引没啥帮助
看看作者给的B代码
// Plain namespace-scope variable (neither static nor const). The commented-out
// line below is the `static` alternative that the surrounding text compares
// this version against.
uint64_t modulus = 1ULL << 31; // 2^31
// static uint64_t modulus = 1ULL << 31; // 2^31

// Applies the update S = (S*P + Q) % modulus exactly N times and returns the
// final S. With N == 0 the input S is returned unchanged.
uint64_t loop(uint64_t N, uint64_t S, uint64_t P, uint64_t Q) {
  for (uint64_t remaining = N; remaining != 0; --remaining) {
    const uint64_t next = S * P + Q;
    S = next % modulus;
  }
  return S;
}
把 modulus 改成 static(内部链接)之后,编译器能确认它不会被其他编译单元修改,于是把它的值当作常量直接"内联"进循环里,取模运算也就能被优化掉了。
不是哥们,你直接用宏/constexpr得了呗,又不改动,全局变量影响多大没有数?
/// Minimal reference-counted owning pointer (teaching implementation).
///
/// Stores two raw pointers: `ctrl_blk_`, the control block whose
/// `add_shared()` / `release_shared()` are called on copy and destruction,
/// and `t_`, the managed object. A default-constructed instance holds two
/// null pointers and never touches a control block.
template<typename T>
class shared_ptr {
  ctrl_blk_base* ctrl_blk_{};
  T*             t_{};

  // Adopts a control block that stores the object in place; the object
  // pointer is taken from the block itself. Reachable only through the
  // befriended make_shared().
  shared_ptr(ctrl_blk_with_storage<T>* cb)
  : shared_ptr{cb, cb->get()}
  {}

  // Adopts an existing control block / object pair without touching the count.
  shared_ptr(ctrl_blk_base* cb, T* t)
  : ctrl_blk_{cb}
  , t_{t}
  {}

  template<typename U, typename... Args>
  friend shared_ptr<U> make_shared(Args&&... vals);

public:
  shared_ptr() = default;

  /// Takes ownership of `t` and allocates a separate control block for it.
  /// If that allocation throws, `t` is deleted before the exception
  /// propagates, so the caller never ends up with a leaked object.
  shared_ptr(T* t)
  : t_{t}
  {
    try {
      ctrl_blk_ = new ctrl_blk<T>{t};
    } catch(...) {
      delete t;
      throw;
    }
  }

  /// Gives up this instance's share; empty instances do nothing.
  ~shared_ptr()
  {
    if(ctrl_blk_) { ctrl_blk_->release_shared(); }
  }

  /// Copy: both instances refer to the same control block, which gains a share.
  shared_ptr(const shared_ptr& rhs)
  : ctrl_blk_{rhs.ctrl_blk_}
  , t_{rhs.t_}
  {
    if(ctrl_blk_) { ctrl_blk_->add_shared(); }
  }

  /// Move: steals both pointers and leaves `rhs` empty; the count is untouched.
  /// noexcept so containers can move instead of copy when they reallocate.
  shared_ptr(shared_ptr&& rhs) noexcept
  : ctrl_blk_{rhs.ctrl_blk_}
  , t_{rhs.t_}
  {
    rhs.ctrl_blk_ = nullptr;
    rhs.t_        = nullptr;
  }

  /// Copy assignment via copy-and-swap: the temporary takes over the old
  /// state and releases it when it is destroyed at the end of the statement.
  shared_ptr& operator=(const shared_ptr& rhs)
  {
    shared_ptr{rhs}.swap(*this);  // forward to copy ctor
    return *this;
  }

  /// Move assignment via move-and-swap; also safe for self-assignment because
  /// the state round-trips through the temporary.
  shared_ptr& operator=(shared_ptr&& rhs) noexcept
  {
    shared_ptr{std::move(rhs)}.swap(*this);  // forward to move-ctor
    return *this;
  }

  /// Exchanges the managed object and control block with `rhs`.
  void swap(shared_ptr& rhs) noexcept
  {
    std::swap(t_, rhs.t_);
    std::swap(ctrl_blk_, rhs.ctrl_blk_);
  }
};
/// Creates a shared_ptr<T> whose object is constructed from `vals` inside a
/// single heap-allocated ctrl_blk_with_storage<T>.
template<typename T, typename... Args>
shared_ptr<T> make_shared(Args&&... vals)
{
  // One allocation holds both the reference count and the T instance.
  auto* combined = new ctrl_blk_with_storage<T>(std::forward<Args>(vals)...);

  // Uses shared_ptr's private control-block constructor (this function is a friend).
  return shared_ptr<T>{combined};
}
/// Abstract base of the control blocks: owns the shared reference count and
/// leaves the "last share released" policy to the derived class.
struct ctrl_blk_base {
  /// Number of shares; a freshly created block already counts one.
  std::atomic_uint64_t shared_ref_count_{1};

  /// Virtual so that destroying a control block through a base pointer is
  /// well defined, as expected of any polymorphic base class.
  virtual ~ctrl_blk_base() = default;

  /// Registers one more share.
  void add_shared() { ++shared_ref_count_; }

  /// Drops one share and returns the count that remains afterwards.
  auto dec() { return --shared_ref_count_; }

  /// Gives up one share; implementations decide what to free once the last
  /// share is gone.
  virtual void release_shared() = 0;
};
/// Control block for an object that was allocated separately from the block:
/// it keeps the object's pointer and deletes both once the last share is gone.
template<typename T>
struct ctrl_blk : ctrl_blk_base {
  T* data_;

  explicit ctrl_blk(T* data)
  : ctrl_blk_base{}
  , data_{data}
  {}

  void release_shared() override
  {
    // Shares remain after this release: nothing to tear down yet.
    if(dec() != 0) { return; }

    delete data_;
    delete this;  // self delete - no member may be touched past this point
  }
};
/// Control block that stores the managed object inside itself, so a single
/// `delete this` destroys the object together with the block.
template<typename T>
struct ctrl_blk_with_storage : ctrl_blk_base {
  T in_place_;

  /// Forwards `vals` to brace-initialize the embedded object.
  template<typename... Args>
  explicit ctrl_blk_with_storage(Args&&... vals)
  : ctrl_blk_base{}
  , in_place_{std::forward<Args>(vals)...}
  {}

  /// Address of the embedded object.
  T* get() { return &in_place_; }

  void release_shared() override
  {
    // Shares remain after this release: keep the block alive.
    if(dec() != 0) { return; }

    delete this;  // self delete - also runs in_place_'s destructor
  }
};
非常简单,大家看懂了吗
LLVM 引入了新的 sanitizer:RTSan(RealtimeSanitizer)。被标记为 `[[clang::nonblocking]]` 的函数,只要在它的调用路径中检测到 malloc、free、pthread_mutex_lock 这类可能阻塞的调用,就会报错
看样例
#include <vector>
// Intentionally broken example: the function is annotated
// [[clang::nonblocking]], yet its body grows a std::vector.
void violation() [[clang::nonblocking]]{
std::vector<float> v;
// Resizing the empty vector to 100 elements has to allocate; in the sanitizer
// output shown after this example, this resize() call reaches malloc.
v.resize(100);
}
// Entry point: calls violation() once so the report is produced when the
// instrumented binary runs.
int main() {
violation();
return 0;
}
//clang++ -fsanitize=realtime -g example_realtime_violation.cpp
输出
clang++ -fsanitize=realtime -g example_realtime_violation.cpp
./a.out
Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace:
0 0x000102893034 in __rtsan::PrintStackTrace() rtsan_stack.cpp:45
1 0x000102892e64 in __rtsan::Context::ExpectNotRealtime(char const*) rtsan_context.cpp:78
2 0x00010289397c in malloc rtsan_interceptors.cpp:286
3 0x000195bd7bd0 in operator new(unsigned long)+0x1c (libc++abi.dylib:arm64+0x16bd0)
4 0x5c7f00010230f07c (<unknown module>)
5 0x00010230f058 in std::__1::__libcpp_allocate[abi:ue170006](unsigned long, unsigned long) new:324
6 0x00010230effc in std::__1::allocator<float>::allocate[abi:ue170006](unsigned long) allocator.h:114
... snip ...
10 0x00010230e4bc in std::__1::vector<float, std::__1::allocator<float>>::__append(unsigned long) vector:1162
11 0x00010230dcdc in std::__1::vector<float, std::__1::allocator<float>>::resize(unsigned long) vector:1981
12 0x00010230dc28 in violation() main.cpp:5
13 0x00010230dd64 in main main.cpp:9
14 0x0001958960dc (<unknown module>)
15 0x2f557ffffffffffc (<unknown module>)
还是很准的,可以更好地控制快速路径中可能阻塞的函数调用
如何使用?最新llvm
cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="compiler-rt" <path to source>/llvm
lemire新活,
很多场景并不需要 double 那么高的精度,可以改用 bfloat16(brain float 16):每个值只占 16 位,是 double 的四分之一,同样的内存带宽能搬运四倍的数据;如果再用上 SIMD,速度还可以更快
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
// Converts `length` doubles from `src` into 16-bit values written to `dst`,
// eight elements per step plus one masked step for the remainder.
// Despite the function name, the narrowing intrinsic is _mm256_cvtneps_pbh,
// i.e. the result is packed bfloat16 ("brain float"), not IEEE half precision.
// NOTE(review): presumably needs AVX-512F/BW/VL and AVX512-BF16 enabled at
// compile time and available at run time - confirm against the Intel
// Intrinsics Guide.
void to_float16(uint16_t *dst, const double *src, size_t length) {
size_t i = 0;
__mmask8 mask;
// Main loop: runs while at least 8 elements remain
for (; i + 7 < length; i += 8) {
// Unaligned load of 8 doubles
__m512d src_vec = _mm512_loadu_pd(&src[i]);
// double -> float (round to nearest, exceptions suppressed), then float -> bfloat16
__m128bh dst_vec = _mm256_cvtneps_pbh(_mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
// Store all 128 bits (8 x 16-bit) by reinterpreting the bfloat16 vector as __m128i
_mm_storeu_si128((__m128i*)&dst[i], *(__m128i*)&dst_vec);
}
// Tail: between 1 and 7 elements are left
if (i < length) {
// Set the low (length - i) bits: one mask bit per remaining element
mask = (1 << (length - i)) - 1;
// Masked load of the remaining doubles ("maskz" form: unselected lanes read as zero)
__m512d src_vec = _mm512_maskz_loadu_pd(mask, &src[i]);
// Same two-step narrowing as in the main loop
__m128bh dst_vec = _mm256_cvtneps_pbh(_mm512_cvt_roundpd_ps(src_vec, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
// Masked store: only the 16-bit lanes selected by mask are written to dst
_mm_mask_storeu_epi16(&dst[i], mask, *(__m128i*)&dst_vec);
}
}
// Inverse of to_float16: widens `length` 16-bit values from `src` into
// doubles written to `dst`, eight elements per step plus one masked step for
// the remainder.
// Despite the function name, the widening intrinsic is _mm256_cvtpbh_ps, i.e.
// the input is interpreted as packed bfloat16, not IEEE half precision.
// NOTE(review): presumably needs AVX-512F/BW/VL and AVX512-BF16 enabled at
// compile time and available at run time - confirm against the Intel
// Intrinsics Guide.
void from_float16(double *dst, const uint16_t *src, size_t length) {
size_t i = 0;
__mmask8 mask;
// Main loop: runs while at least 8 elements remain
for (; i + 7 < length; i += 8) {
// Unaligned 128-bit load of 8 x 16-bit values (the C-style cast also drops const from src)
__m128i src_vec = _mm_loadu_si128((__m128i*)&src[i]);
// Reinterpret as bfloat16, widen to float, then widen to double
__m512d dst_vec = _mm512_cvtps_pd(_mm256_cvtpbh_ps(*(__m128bh*)&src_vec));
// Unaligned store of 8 doubles
_mm512_storeu_pd(&dst[i], dst_vec);
}
// Tail: between 1 and 7 elements are left
if (i < length) {
// Set the low (length - i) bits: one mask bit per remaining element
mask = (1 << (length - i)) - 1;
// Masked load of the remaining 16-bit values ("maskz" form: unselected lanes read as zero)
__m128i src_vec = _mm_maskz_loadu_epi16(mask, &src[i]);
// Same widening chain as in the main loop
__m512d dst_vec = _mm512_cvtps_pd(_mm256_cvtpbh_ps( *(__m128bh*)&src_vec));
// Masked store: only the double lanes selected by mask are written to dst
_mm512_mask_storeu_pd(&dst[i], mask, dst_vec);
}
}
作者测试压缩2亿条每秒,解压0.9亿条每秒
考虑 geo/AI 之类对精度要求不高的场景,这种压缩带来的提升是非常明显的
- asteria 一个脚本语言,可嵌入,长期找人,希望胖友们帮帮忙,也可以加群753302367和作者对线
一个c的enum to string
#include <stdio.h>
#include <string.h>
/* Number of arguments passed, obtained by sizing an int array compound literal. */
#define NUMARGS(...) (sizeof((int[]){__VA_ARGS__})/sizeof(int))

/*
 * ENUM_TO_STRING(ENUM_NAME, A, B, ...) defines, at file scope:
 *   - enum ENUM_NAME { A, B, ... };
 *   - ENUM_NAME_strings[]        the stringified enumerator list
 *   - ENUM_NAMEstrings_indices[] offset of every name inside that buffer
 *   - char *ENUM_NAME_to_string(enum ENUM_NAME value)
 *
 * The first call splits the buffer in place: every ',' becomes '\0' and the
 * start offset of each name is recorded. Blanks around the commas (as in
 * "A, B") are not part of the names, and splitting stops once NUMARGS names
 * have been recorded, so a trailing comma cannot overrun the index array.
 *
 * ENUM_NAME_to_string returns the name of `value`, or NULL when `value` is
 * not a valid index. Names are looked up by numeric value, so the
 * enumerators must keep their default values (0, 1, 2, ...).
 * The lazy initialisation uses a plain static flag and is not synchronised.
 */
#define ENUM_TO_STRING(ENUM_NAME, ...) \
enum ENUM_NAME { __VA_ARGS__ }; \
char ENUM_NAME##_strings[] = #__VA_ARGS__ ; \
long ENUM_NAME##strings_indices[NUMARGS(__VA_ARGS__)]; \
char *ENUM_NAME##_to_string(enum ENUM_NAME value) { \
    static int init = 0; \
    if (init == 0) { \
        size_t n = 0; \
        char *cursor = ENUM_NAME##_strings; \
        while (cursor && n < NUMARGS(__VA_ARGS__)) { \
            char *comma; \
            char *end; \
            /* skip the blank that follows a comma */ \
            while (*cursor == ' ') { ++cursor; } \
            ENUM_NAME##strings_indices[n++] = (long)(cursor - ENUM_NAME##_strings); \
            comma = strchr(cursor, ','); \
            /* terminate the name, dropping blanks in front of the comma */ \
            end = comma; \
            while (end && end > cursor && end[-1] == ' ') { --end; } \
            if (end) { *end = '\0'; } \
            cursor = comma ? comma + 1 : NULL; \
        } \
        init = 1; \
    } \
    /* negative values wrap to large unsigned numbers and are rejected too */ \
    if ((unsigned long)value >= NUMARGS(__VA_ARGS__)) { return NULL; } \
    return ENUM_NAME##_strings + ENUM_NAME##strings_indices[value]; \
}

/* Usage: just create the enum */
ENUM_TO_STRING(Color, RED, GREEN, BLUE, VIOLET)
int main(void)
{
printf("%s\n",Color_to_string(RED));
printf("%s\n",Color_to_string(BLUE));
printf("%s\n",Color_to_string(GREEN));
printf("%s\n",Color_to_string(VIOLET));
printf("%s\n",Color_to_string(GREEN));
printf("%s\n",Color_to_string(BLUE));
return 0;
}
c++ 有magic_enum,别学哈
最近感觉文章越来越少了,重复度越来越高,难道说我的周刊到瓶颈了?已经通过剥削群友的方式拿到了供稿,又有东西可以发了