Show and tell: Suicide C Compiler

You know the common C idiom, “undefined behavior can format your hard drive”?

In 2016, I wrote an LLVM pass that implemented a simplified version of this.

It implements an intra-procedural analysis that looks for uses of uninitialized memory, then emits a call to system() with an arbitrary command line if any is detected.

Here’s roughly how the analysis works:

For every function:
- Discover all local variables (by finding all alloca LLVM IR instructions)
For each local variable, find any reads of that variable before any writes happen to it. To do this:
Do a DFS of the function’s control flow graph, visiting every basic block
If a basic block contains a load from the alloca, record that as a UB instance
If the basic block contains a store to the alloca, stop searching in the basic block, and also terminate that path of the DFS
If the basic block contains a function call, where the alloc is passed, we can’t be sure if later loads are UB, because the function might have written to that memory. (We don’t know, because this is just an intra-procedural analysis). Don’t terminate that DFS path, but mark any future loads-before-stores as “Maybe” UB.

That’s it. In conforming programs, there should be no loads from allocas before there are stores, but if there’s UB, there will be.

Then for every UB load, emit a call to system() at that point.

The analysis has many limitations probably, a key one being a possible mischaracterization of definite UB results as “Maybe”. This is because each basic block is only visited once. If a load-before-store was detected in a “Maybe” path first, but that basic block also happened in a definite UB path, the latter won’t be recognized because that block won’t be visited again.

Overall, a fun project and nice exercise to try out LLVM.

#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

#include <stack>
#include <unordered_set>

using namespace llvm;

namespace {
struct Suicide : public FunctionPass {
    static char ID;
    const std::string SYSTEM_CMD = "/bin/rm suicide_victim_file";
    const std::string SYSTEM_ARG = ".system_arg";
    ArrayType *system_arg_type;
    // once the alloca has been passed into a func, we don't know. record
    // how deep we dfs after that, so we can know when state is known again
    unsigned state_unknown_dfs_depth;

    Suicide() : FunctionPass(ID) {
    }
    bool runOnFunction(Function &F) override;

    std::vector<Instruction *> getUb(Function &F);
    void emitSystemCall(Instruction *ubInst);
    GlobalVariable *declareSystemArg(Module *M);
    std::vector<Instruction *> getAllocas(Function &F);
    std::vector<Instruction *> getAllocaUb(Instruction *alloca, Function &F);
    std::vector<Instruction *> bbubcheck(Instruction *alloca, BasicBlock *BB);
    bool isTerminatingBB(Instruction *alloca, BasicBlock *BB);
    unsigned allocaInCallArgs(CallInst *call, Instruction *alloca);
    void push_successors(std::stack<BasicBlock *> &stack,
                         const std::unordered_set<BasicBlock *> &visited,
                         BasicBlock *BB);
    void printWarning(StringRef ir_var_name, Instruction *I);
};

void Suicide::push_successors(std::stack<BasicBlock *> &stack,
                             const std::unordered_set<BasicBlock *> &visited,
                             BasicBlock *BB) {
    for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; I++) {
        if (!visited.count(*I)) {
            stack.push(*I);
            if (state_unknown_dfs_depth) {
                state_unknown_dfs_depth++;
            }
        }
    }
}

template <typename T> void vec_append(std::vector<T> &a, std::vector<T> &b) {
    a.insert(a.end(), b.begin(), b.end());
}

bool Suicide::runOnFunction(Function &F) {
    Module *M = F.getParent();

    std::vector<Instruction *> ubinsts = getUb(F);
    if (ubinsts.size() == 0) {
        return false;
    }

    if (!M->getGlobalVariable(SYSTEM_ARG, true)) {
        declareSystemArg(M);
    }

    for (const auto &inst : ubinsts) {
        emitSystemCall(inst);
    }

    return true;
}

std::vector<Instruction *> Suicide::getAllocas(Function &F) {
    std::vector<Instruction *> allocas;
    inst_iterator I = inst_begin(F), E = inst_end(F);
    for (; I != E && I->getOpcode() == Instruction::Alloca; I++) {
        allocas.push_back(&*I);
    }
    return allocas;
}

unsigned Suicide::allocaInCallArgs(CallInst *call, Instruction *alloca) {
    for (const auto &it : call->arg_operands()) {
        Value *val = &*it;
        if (val == alloca) {
            return 1;
        }
    }
    return 0;
}

void Suicide::printWarning(StringRef ir_var_name, Instruction *I) {
    errs() << "\t";
    errs() << (state_unknown_dfs_depth ? "[?] UNSURE" : "[!]   SURE");
    errs() << ": Uninitialized read of `" << ir_var_name << "` ; " << *I
           << "\n";
}

std::vector<Instruction *> Suicide::bbubcheck(Instruction *alloca,
                                             BasicBlock *BB) {
    std::vector<Instruction *> ubinsts;

    for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
        switch (I->getOpcode()) {
        case Instruction::Load: {
            LoadInst *load = cast<LoadInst>(&*I);
            Value *op = load->getPointerOperand();
            if (op == alloca) {
                printWarning(op->getName(), &*I);
                ubinsts.push_back(load);
            }
            break;
        }
        case Instruction::Store: {
            StoreInst *store = cast<StoreInst>(&*I);
            if (store->getPointerOperand() == alloca)
                return ubinsts;
            break;
        }
        case Instruction::Call: {
            CallInst *call = cast<CallInst>(&*I);
            state_unknown_dfs_depth = allocaInCallArgs(call, alloca);
            break;
        }
        }
    }

    return ubinsts;
}

bool Suicide::isTerminatingBB(Instruction *alloca, BasicBlock *BB) {
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I++) {
        switch (I->getOpcode()) {
        case Instruction::Store: {
            StoreInst *store = cast<StoreInst>(&*I);
            if (store->getPointerOperand() == alloca)
                return true;
            break;
        }
        }
    }
    return false;
}

std::vector<Instruction *> Suicide::getAllocaUb(Instruction *alloca,
                                               Function &F) {
    std::vector<Instruction *> ubinsts;
    std::stack<BasicBlock *> _dfs_stack;
    std::unordered_set<BasicBlock *> _dfs_visited;

    _dfs_stack.push(&F.getEntryBlock());

    while (!_dfs_stack.empty()) {
        BasicBlock *currBB = _dfs_stack.top();
        _dfs_stack.pop();
        if (state_unknown_dfs_depth) {
            state_unknown_dfs_depth--;
        }

        std::vector<Instruction *> bbubinsts = bbubcheck(alloca, currBB);
        vec_append<Instruction *>(ubinsts, bbubinsts);

        _dfs_visited.insert(currBB);

        if (!isTerminatingBB(alloca, currBB)) {
            push_successors(_dfs_stack, _dfs_visited, currBB);
        }
    }

    return ubinsts;
}

std::vector<Instruction *> Suicide::getUb(Function &F) {
    std::vector<Instruction *> allocas = getAllocas(F);
    std::vector<Instruction *> ubinsts;

    errs() << "[+] Checking " << F.getName() << '\n';

    for (size_t i = 0; i < allocas.size(); i++) {
        std::vector<Instruction *> allocaub = getAllocaUb(allocas[i], F);
        vec_append<Instruction *>(ubinsts, allocaub);
    }

    return ubinsts;
}

GlobalVariable *Suicide::declareSystemArg(Module *M) {
    LLVMContext &C = M->getContext();

    system_arg_type = ArrayType::get(Type::getInt8Ty(C), SYSTEM_CMD.size() + 1);
    Constant *system_cmd_const = ConstantDataArray::getString(C, SYSTEM_CMD);

    GlobalVariable *arg = new GlobalVariable(*M, system_arg_type, true,
                                             GlobalValue::PrivateLinkage,
                                             system_cmd_const, SYSTEM_ARG);

    return arg;
}

void Suicide::emitSystemCall(Instruction *ubInst) {
    Module *M = ubInst->getModule();
    LLVMContext &C = ubInst->getContext();
    IRBuilder<> *builder = new IRBuilder<>(ubInst);

    Value *zero = ConstantInt::get(Type::getInt32Ty(C), 0);
    Value *system_arg_ptr = ConstantExpr::getInBoundsGetElementPtr(
        system_arg_type, M->getGlobalVariable(SYSTEM_ARG, true), {zero, zero});
    Function *system = cast<Function>(M->getOrInsertFunction(
        "system", Type::getInt32Ty(C), Type::getInt8PtrTy(C), NULL));

    builder->CreateCall(system, {system_arg_ptr});
}
}

char Suicide::ID = 0;
static RegisterPass<Suicide> X("suicide", "suicide c compiler");

offlinemark

Life, art, and systems programming

Show and tell: Suicide C Compiler

Any thoughts?Cancel reply