root/kernel/fs.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. readsb
  2. fsinit
  3. bzero
  4. balloc
  5. bfree
  6. iinit
  7. ialloc
  8. iupdate
  9. iget
  10. idup
  11. ilock
  12. iunlock
  13. iput
  14. iunlockput
  15. bmap
  16. itrunc
  17. stati
  18. readi
  19. writei
  20. namecmp
  21. dirlookup
  22. dirlink
  23. skipelem
  24. namex
  25. namei
  26. nameiparent

   1 // File system implementation.  Five layers:
   2 //   + Blocks: allocator for raw disk blocks.
   3 //   + Log: crash recovery for multi-step updates.
   4 //   + Files: inode allocator, reading, writing, metadata.
   5 //   + Directories: inode with special contents (list of other inodes!)
   6 //   + Names: paths like /usr/rtm/xv6/fs.c for convenient naming.
   7 //
   8 // This file contains the low-level file system manipulation
   9 // routines.  The (higher-level) system call implementations
  10 // are in sysfile.c.
  11 
  12 #include "types.h"
  13 #include "riscv.h"
  14 #include "defs.h"
  15 #include "param.h"
  16 #include "stat.h"
  17 #include "spinlock.h"
  18 #include "proc.h"
  19 #include "sleeplock.h"
  20 #include "fs.h"
  21 #include "buf.h"
  22 #include "file.h"
  23 
  24 #define min(a, b) ((a) < (b) ? (a) : (b))
  25 // there should be one superblock per disk device, but we run with
  26 // only one device
  27 struct superblock sb; 
  28 
  29 // Read the super block.
  30 static void
  31 readsb(int dev, struct superblock *sb)
  32 {
  33   struct buf *bp;
  34 
  35   bp = bread(dev, 1);
  36   memmove(sb, bp->data, sizeof(*sb));
  37   brelse(bp);
  38 }
  39 
  40 // Init fs
  41 void
  42 fsinit(int dev) {
  43   readsb(dev, &sb);
  44   if(sb.magic != FSMAGIC)
  45     panic("invalid file system");
  46   initlog(dev, &sb);
  47 }
  48 
  49 // Zero a block.
  50 static void
  51 bzero(int dev, int bno)
  52 {
  53   struct buf *bp;
  54 
  55   bp = bread(dev, bno);
  56   memset(bp->data, 0, BSIZE);
  57   log_write(bp);
  58   brelse(bp);
  59 }
  60 
  61 // Blocks.
  62 
  63 // Allocate a zeroed disk block.
  64 // returns 0 if out of disk space.
  65 static uint
  66 balloc(uint dev)
  67 {
  68   int b, bi, m;
  69   struct buf *bp;
  70 
  71   bp = 0;
  72   for(b = 0; b < sb.size; b += BPB){
  73     bp = bread(dev, BBLOCK(b, sb));
  74     for(bi = 0; bi < BPB && b + bi < sb.size; bi++){
  75       m = 1 << (bi % 8);
  76       if((bp->data[bi/8] & m) == 0){  // Is block free?
  77         bp->data[bi/8] |= m;  // Mark block in use.
  78         log_write(bp);
  79         brelse(bp);
  80         bzero(dev, b + bi);
  81         return b + bi;
  82       }
  83     }
  84     brelse(bp);
  85   }
  86   printf("balloc: out of blocks\n");
  87   return 0;
  88 }
  89 
  90 // Free a disk block.
  91 static void
  92 bfree(int dev, uint b)
  93 {
  94   struct buf *bp;
  95   int bi, m;
  96 
  97   bp = bread(dev, BBLOCK(b, sb));
  98   bi = b % BPB;
  99   m = 1 << (bi % 8);
 100   if((bp->data[bi/8] & m) == 0)
 101     panic("freeing free block");
 102   bp->data[bi/8] &= ~m;
 103   log_write(bp);
 104   brelse(bp);
 105 }
 106 
 107 // Inodes.
 108 //
 109 // An inode describes a single unnamed file.
 110 // The inode disk structure holds metadata: the file's type,
 111 // its size, the number of links referring to it, and the
 112 // list of blocks holding the file's content.
 113 //
 114 // The inodes are laid out sequentially on disk at block
 115 // sb.inodestart. Each inode has a number, indicating its
 116 // position on the disk.
 117 //
 118 // The kernel keeps a table of in-use inodes in memory
 119 // to provide a place for synchronizing access
 120 // to inodes used by multiple processes. The in-memory
 121 // inodes include book-keeping information that is
 122 // not stored on disk: ip->ref and ip->valid.
 123 //
 124 // An inode and its in-memory representation go through a
 125 // sequence of states before they can be used by the
 126 // rest of the file system code.
 127 //
 128 // * Allocation: an inode is allocated if its type (on disk)
 129 //   is non-zero. ialloc() allocates, and iput() frees if
 130 //   the reference and link counts have fallen to zero.
 131 //
 132 // * Referencing in table: an entry in the inode table
 133 //   is free if ip->ref is zero. Otherwise ip->ref tracks
 134 //   the number of in-memory pointers to the entry (open
 135 //   files and current directories). iget() finds or
 136 //   creates a table entry and increments its ref; iput()
 137 //   decrements ref.
 138 //
 139 // * Valid: the information (type, size, &c) in an inode
 140 //   table entry is only correct when ip->valid is 1.
 141 //   ilock() reads the inode from
 142 //   the disk and sets ip->valid, while iput() clears
 143 //   ip->valid if ip->ref has fallen to zero.
 144 //
 145 // * Locked: file system code may only examine and modify
 146 //   the information in an inode and its content if it
 147 //   has first locked the inode.
 148 //
 149 // Thus a typical sequence is:
 150 //   ip = iget(dev, inum)
 151 //   ilock(ip)
 152 //   ... examine and modify ip->xxx ...
 153 //   iunlock(ip)
 154 //   iput(ip)
 155 //
 156 // ilock() is separate from iget() so that system calls can
 157 // get a long-term reference to an inode (as for an open file)
 158 // and only lock it for short periods (e.g., in read()).
 159 // The separation also helps avoid deadlock and races during
 160 // pathname lookup. iget() increments ip->ref so that the inode
 161 // stays in the table and pointers to it remain valid.
 162 //
 163 // Many internal file system functions expect the caller to
 164 // have locked the inodes involved; this lets callers create
 165 // multi-step atomic operations.
 166 //
 167 // The itable.lock spin-lock protects the allocation of itable
 168 // entries. Since ip->ref indicates whether an entry is free,
 169 // and ip->dev and ip->inum indicate which i-node an entry
 170 // holds, one must hold itable.lock while using any of those fields.
 171 //
 172 // An ip->lock sleep-lock protects all ip-> fields other than ref,
 173 // dev, and inum.  One must hold ip->lock in order to
 174 // read or write that inode's ip->valid, ip->size, ip->type, &c.
 175 
 176 struct {
 177   struct spinlock lock;
 178   struct inode inode[NINODE];
 179 } itable;
 180 
 181 void
 182 iinit()
 183 {
 184   int i = 0;
 185   
 186   initlock(&itable.lock, "itable");
 187   for(i = 0; i < NINODE; i++) {
 188     initsleeplock(&itable.inode[i].lock, "inode");
 189   }
 190 }
 191 
 192 static struct inode* iget(uint dev, uint inum);
 193 
 194 // Allocate an inode on device dev.
 195 // Mark it as allocated by  giving it type type.
 196 // Returns an unlocked but allocated and referenced inode,
 197 // or NULL if there is no free inode.
 198 struct inode*
 199 ialloc(uint dev, short type)
 200 {
 201   int inum;
 202   struct buf *bp;
 203   struct dinode *dip;
 204 
 205   for(inum = 1; inum < sb.ninodes; inum++){
 206     bp = bread(dev, IBLOCK(inum, sb));
 207     dip = (struct dinode*)bp->data + inum%IPB;
 208     if(dip->type == 0){  // a free inode
 209       memset(dip, 0, sizeof(*dip));
 210       dip->type = type;
 211       log_write(bp);   // mark it allocated on the disk
 212       brelse(bp);
 213       return iget(dev, inum);
 214     }
 215     brelse(bp);
 216   }
 217   printf("ialloc: no inodes\n");
 218   return 0;
 219 }
 220 
 221 // Copy a modified in-memory inode to disk.
 222 // Must be called after every change to an ip->xxx field
 223 // that lives on disk.
 224 // Caller must hold ip->lock.
 225 void
 226 iupdate(struct inode *ip)
 227 {
 228   struct buf *bp;
 229   struct dinode *dip;
 230 
 231   bp = bread(ip->dev, IBLOCK(ip->inum, sb));
 232   dip = (struct dinode*)bp->data + ip->inum%IPB;
 233   dip->type = ip->type;
 234   dip->major = ip->major;
 235   dip->minor = ip->minor;
 236   dip->nlink = ip->nlink;
 237   dip->size = ip->size;
 238   memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));
 239   log_write(bp);
 240   brelse(bp);
 241 }
 242 
 243 // Find the inode with number inum on device dev
 244 // and return the in-memory copy. Does not lock
 245 // the inode and does not read it from disk.
 246 static struct inode*
 247 iget(uint dev, uint inum)
 248 {
 249   struct inode *ip, *empty;
 250 
 251   acquire(&itable.lock);
 252 
 253   // Is the inode already in the table?
 254   empty = 0;
 255   for(ip = &itable.inode[0]; ip < &itable.inode[NINODE]; ip++){
 256     if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
 257       ip->ref++;
 258       release(&itable.lock);
 259       return ip;
 260     }
 261     if(empty == 0 && ip->ref == 0)    // Remember empty slot.
 262       empty = ip;
 263   }
 264 
 265   // Recycle an inode entry.
 266   if(empty == 0)
 267     panic("iget: no inodes");
 268 
 269   ip = empty;
 270   ip->dev = dev;
 271   ip->inum = inum;
 272   ip->ref = 1;
 273   ip->valid = 0;
 274   release(&itable.lock);
 275 
 276   return ip;
 277 }
 278 
 279 // Increment reference count for ip.
 280 // Returns ip to enable ip = idup(ip1) idiom.
 281 struct inode*
 282 idup(struct inode *ip)
 283 {
 284   acquire(&itable.lock);
 285   ip->ref++;
 286   release(&itable.lock);
 287   return ip;
 288 }
 289 
 290 // Lock the given inode.
 291 // Reads the inode from disk if necessary.
 292 void
 293 ilock(struct inode *ip)
 294 {
 295   struct buf *bp;
 296   struct dinode *dip;
 297 
 298   if(ip == 0 || ip->ref < 1)
 299     panic("ilock");
 300 
 301   acquiresleep(&ip->lock);
 302 
 303   if(ip->valid == 0){
 304     bp = bread(ip->dev, IBLOCK(ip->inum, sb));
 305     dip = (struct dinode*)bp->data + ip->inum%IPB;
 306     ip->type = dip->type;
 307     ip->major = dip->major;
 308     ip->minor = dip->minor;
 309     ip->nlink = dip->nlink;
 310     ip->size = dip->size;
 311     memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
 312     brelse(bp);
 313     ip->valid = 1;
 314     if(ip->type == 0)
 315       panic("ilock: no type");
 316   }
 317 }
 318 
 319 // Unlock the given inode.
 320 void
 321 iunlock(struct inode *ip)
 322 {
 323   if(ip == 0 || !holdingsleep(&ip->lock) || ip->ref < 1)
 324     panic("iunlock");
 325 
 326   releasesleep(&ip->lock);
 327 }
 328 
 329 // Drop a reference to an in-memory inode.
 330 // If that was the last reference, the inode table entry can
 331 // be recycled.
 332 // If that was the last reference and the inode has no links
 333 // to it, free the inode (and its content) on disk.
 334 // All calls to iput() must be inside a transaction in
 335 // case it has to free the inode.
 336 void
 337 iput(struct inode *ip)
 338 {
 339   acquire(&itable.lock);
 340 
 341   if(ip->ref == 1 && ip->valid && ip->nlink == 0){
 342     // inode has no links and no other references: truncate and free.
 343 
 344     // ip->ref == 1 means no other process can have ip locked,
 345     // so this acquiresleep() won't block (or deadlock).
 346     acquiresleep(&ip->lock);
 347 
 348     release(&itable.lock);
 349 
 350     itrunc(ip);
 351     ip->type = 0;
 352     iupdate(ip);
 353     ip->valid = 0;
 354 
 355     releasesleep(&ip->lock);
 356 
 357     acquire(&itable.lock);
 358   }
 359 
 360   ip->ref--;
 361   release(&itable.lock);
 362 }
 363 
 364 // Common idiom: unlock, then put.
 365 void
 366 iunlockput(struct inode *ip)
 367 {
 368   iunlock(ip);
 369   iput(ip);
 370 }
 371 
 372 // Inode content
 373 //
 374 // The content (data) associated with each inode is stored
 375 // in blocks on the disk. The first NDIRECT block numbers
 376 // are listed in ip->addrs[].  The next NINDIRECT blocks are
 377 // listed in block ip->addrs[NDIRECT].
 378 
 379 // Return the disk block address of the nth block in inode ip.
 380 // If there is no such block, bmap allocates one.
 381 // returns 0 if out of disk space.
 382 static uint
 383 bmap(struct inode *ip, uint bn)
 384 {
 385   uint addr, *a;
 386   struct buf *bp;
 387 
 388   if(bn < NDIRECT){
 389     if((addr = ip->addrs[bn]) == 0){
 390       addr = balloc(ip->dev);
 391       if(addr == 0)
 392         return 0;
 393       ip->addrs[bn] = addr;
 394     }
 395     return addr;
 396   }
 397   bn -= NDIRECT;
 398 
 399   if(bn < NINDIRECT){
 400     // Load indirect block, allocating if necessary.
 401     if((addr = ip->addrs[NDIRECT]) == 0){
 402       addr = balloc(ip->dev);
 403       if(addr == 0)
 404         return 0;
 405       ip->addrs[NDIRECT] = addr;
 406     }
 407     bp = bread(ip->dev, addr);
 408     a = (uint*)bp->data;
 409     if((addr = a[bn]) == 0){
 410       addr = balloc(ip->dev);
 411       if(addr){
 412         a[bn] = addr;
 413         log_write(bp);
 414       }
 415     }
 416     brelse(bp);
 417     return addr;
 418   }
 419 
 420   panic("bmap: out of range");
 421 }
 422 
 423 // Truncate inode (discard contents).
 424 // Caller must hold ip->lock.
 425 void
 426 itrunc(struct inode *ip)
 427 {
 428   int i, j;
 429   struct buf *bp;
 430   uint *a;
 431 
 432   for(i = 0; i < NDIRECT; i++){
 433     if(ip->addrs[i]){
 434       bfree(ip->dev, ip->addrs[i]);
 435       ip->addrs[i] = 0;
 436     }
 437   }
 438 
 439   if(ip->addrs[NDIRECT]){
 440     bp = bread(ip->dev, ip->addrs[NDIRECT]);
 441     a = (uint*)bp->data;
 442     for(j = 0; j < NINDIRECT; j++){
 443       if(a[j])
 444         bfree(ip->dev, a[j]);
 445     }
 446     brelse(bp);
 447     bfree(ip->dev, ip->addrs[NDIRECT]);
 448     ip->addrs[NDIRECT] = 0;
 449   }
 450 
 451   ip->size = 0;
 452   iupdate(ip);
 453 }
 454 
 455 // Copy stat information from inode.
 456 // Caller must hold ip->lock.
 457 void
 458 stati(struct inode *ip, struct stat *st)
 459 {
 460   st->dev = ip->dev;
 461   st->ino = ip->inum;
 462   st->type = ip->type;
 463   st->nlink = ip->nlink;
 464   st->size = ip->size;
 465 }
 466 
 467 // Read data from inode.
 468 // Caller must hold ip->lock.
 469 // If user_dst==1, then dst is a user virtual address;
 470 // otherwise, dst is a kernel address.
 471 int
 472 readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n)
 473 {
 474   uint tot, m;
 475   struct buf *bp;
 476 
 477   if(off > ip->size || off + n < off)
 478     return 0;
 479   if(off + n > ip->size)
 480     n = ip->size - off;
 481 
 482   for(tot=0; tot<n; tot+=m, off+=m, dst+=m){
 483     uint addr = bmap(ip, off/BSIZE);
 484     if(addr == 0)
 485       break;
 486     bp = bread(ip->dev, addr);
 487     m = min(n - tot, BSIZE - off%BSIZE);
 488     if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) {
 489       brelse(bp);
 490       tot = -1;
 491       break;
 492     }
 493     brelse(bp);
 494   }
 495   return tot;
 496 }
 497 
 498 // Write data to inode.
 499 // Caller must hold ip->lock.
 500 // If user_src==1, then src is a user virtual address;
 501 // otherwise, src is a kernel address.
 502 // Returns the number of bytes successfully written.
 503 // If the return value is less than the requested n,
 504 // there was an error of some kind.
 505 int
 506 writei(struct inode *ip, int user_src, uint64 src, uint off, uint n)
 507 {
 508   uint tot, m;
 509   struct buf *bp;
 510 
 511   if(off > ip->size || off + n < off)
 512     return -1;
 513   if(off + n > MAXFILE*BSIZE)
 514     return -1;
 515 
 516   for(tot=0; tot<n; tot+=m, off+=m, src+=m){
 517     uint addr = bmap(ip, off/BSIZE);
 518     if(addr == 0)
 519       break;
 520     bp = bread(ip->dev, addr);
 521     m = min(n - tot, BSIZE - off%BSIZE);
 522     if(either_copyin(bp->data + (off % BSIZE), user_src, src, m) == -1) {
 523       brelse(bp);
 524       break;
 525     }
 526     log_write(bp);
 527     brelse(bp);
 528   }
 529 
 530   if(off > ip->size)
 531     ip->size = off;
 532 
 533   // write the i-node back to disk even if the size didn't change
 534   // because the loop above might have called bmap() and added a new
 535   // block to ip->addrs[].
 536   iupdate(ip);
 537 
 538   return tot;
 539 }
 540 
 541 // Directories
 542 
 543 int
 544 namecmp(const char *s, const char *t)
 545 {
 546   return strncmp(s, t, DIRSIZ);
 547 }
 548 
 549 // Look for a directory entry in a directory.
 550 // If found, set *poff to byte offset of entry.
 551 struct inode*
 552 dirlookup(struct inode *dp, char *name, uint *poff)
 553 {
 554   uint off, inum;
 555   struct dirent de;
 556 
 557   if(dp->type != T_DIR)
 558     panic("dirlookup not DIR");
 559 
 560   for(off = 0; off < dp->size; off += sizeof(de)){
 561     if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
 562       panic("dirlookup read");
 563     if(de.inum == 0)
 564       continue;
 565     if(namecmp(name, de.name) == 0){
 566       // entry matches path element
 567       if(poff)
 568         *poff = off;
 569       inum = de.inum;
 570       return iget(dp->dev, inum);
 571     }
 572   }
 573 
 574   return 0;
 575 }
 576 
 577 // Write a new directory entry (name, inum) into the directory dp.
 578 // Returns 0 on success, -1 on failure (e.g. out of disk blocks).
 579 int
 580 dirlink(struct inode *dp, char *name, uint inum)
 581 {
 582   int off;
 583   struct dirent de;
 584   struct inode *ip;
 585 
 586   // Check that name is not present.
 587   if((ip = dirlookup(dp, name, 0)) != 0){
 588     iput(ip);
 589     return -1;
 590   }
 591 
 592   // Look for an empty dirent.
 593   for(off = 0; off < dp->size; off += sizeof(de)){
 594     if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
 595       panic("dirlink read");
 596     if(de.inum == 0)
 597       break;
 598   }
 599 
 600   strncpy(de.name, name, DIRSIZ);
 601   de.inum = inum;
 602   if(writei(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de))
 603     return -1;
 604 
 605   return 0;
 606 }
 607 
 608 // Paths
 609 
 610 // Copy the next path element from path into name.
 611 // Return a pointer to the element following the copied one.
 612 // The returned path has no leading slashes,
 613 // so the caller can check *path=='\0' to see if the name is the last one.
 614 // If no name to remove, return 0.
 615 //
 616 // Examples:
 617 //   skipelem("a/bb/c", name) = "bb/c", setting name = "a"
 618 //   skipelem("///a//bb", name) = "bb", setting name = "a"
 619 //   skipelem("a", name) = "", setting name = "a"
 620 //   skipelem("", name) = skipelem("////", name) = 0
 621 //
 622 static char*
 623 skipelem(char *path, char *name)
 624 {
 625   char *s;
 626   int len;
 627 
 628   while(*path == '/')
 629     path++;
 630   if(*path == 0)
 631     return 0;
 632   s = path;
 633   while(*path != '/' && *path != 0)
 634     path++;
 635   len = path - s;
 636   if(len >= DIRSIZ)
 637     memmove(name, s, DIRSIZ);
 638   else {
 639     memmove(name, s, len);
 640     name[len] = 0;
 641   }
 642   while(*path == '/')
 643     path++;
 644   return path;
 645 }
 646 
 647 // Look up and return the inode for a path name.
 648 // If parent != 0, return the inode for the parent and copy the final
 649 // path element into name, which must have room for DIRSIZ bytes.
 650 // Must be called inside a transaction since it calls iput().
 651 static struct inode*
 652 namex(char *path, int nameiparent, char *name)
 653 {
 654   struct inode *ip, *next;
 655 
 656   if(*path == '/')
 657     ip = iget(ROOTDEV, ROOTINO);
 658   else
 659     ip = idup(myproc()->cwd);
 660 
 661   while((path = skipelem(path, name)) != 0){
 662     ilock(ip);
 663     if(ip->type != T_DIR){
 664       iunlockput(ip);
 665       return 0;
 666     }
 667     if(nameiparent && *path == '\0'){
 668       // Stop one level early.
 669       iunlock(ip);
 670       return ip;
 671     }
 672     if((next = dirlookup(ip, name, 0)) == 0){
 673       iunlockput(ip);
 674       return 0;
 675     }
 676     iunlockput(ip);
 677     ip = next;
 678   }
 679   if(nameiparent){
 680     iput(ip);
 681     return 0;
 682   }
 683   return ip;
 684 }
 685 
 686 struct inode*
 687 namei(char *path)
 688 {
 689   char name[DIRSIZ];
 690   return namex(path, 0, name);
 691 }
 692 
 693 struct inode*
 694 nameiparent(char *path, char *name)
 695 {
 696   return namex(path, 1, name);
 697 }

/* [<][>][^][v][top][bottom][index][help] */